From d110a8e6561b3005446f2701503695b8ee150fe9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Mar 2026 14:13:37 -0700 Subject: [PATCH 1/6] Initial backport to Linux 4.18.0 --- Makefile | 7 +- cloudlab/bin/update_linux | 2 +- homa_devel.c | 2 + homa_grant.c | 3 +- homa_grant.h | 2 +- homa_impl.h | 20 ++- homa_incoming.c | 23 +-- homa_metrics.c | 10 +- homa_offload.c | 180 +++++++------------ homa_offload.h | 7 +- homa_outgoing.c | 9 +- homa_peer.c | 21 +-- homa_peer.h | 2 +- homa_plumbing.c | 60 +++---- homa_qdisc.c | 12 +- homa_qdisc.h | 4 +- homa_rpc.c | 5 +- homa_skb.c | 28 ++- homa_wire.h | 8 +- murmurhash3.h | 3 +- notes.txt | 6 + test/Makefile | 6 +- test/mock.c | 235 +++++++++--------------- test/mock.h | 15 +- test/rbtree.c | 95 +++++++++- test/rhashtable.c | 363 +++++++++++++++++++------------------- test/unit_homa_incoming.c | 17 -- test/unit_homa_offload.c | 278 ++++++++++++----------------- test/unit_homa_outgoing.c | 4 +- test/unit_homa_peer.c | 17 +- test/unit_homa_plumbing.c | 92 ++++------ test/unit_homa_skb.c | 8 +- test/unit_homa_sock.c | 7 +- timetrace.c | 10 +- 34 files changed, 728 insertions(+), 833 deletions(-) diff --git a/Makefile b/Makefile index d68e49b8..401ad45c 100644 --- a/Makefile +++ b/Makefile @@ -33,8 +33,10 @@ ifneq ($(KERNEL_SRC),) KDIR ?= $(KERNEL_SRC) endif -LINUX_VERSION ?= $(shell uname -r) +# LINUX_VERSION ?= $(shell uname -r) +LINUX_VERSION := 4.18.0+ KDIR ?= /lib/modules/$(LINUX_VERSION)/build +CC = gcc-8 LINUX_SRC_DIR ?= ../net-next @@ -104,3 +106,6 @@ printClean-%: $(MAKE) -C $(KDIR) M=$(shell pwd) $@ endif + +# Prevents warnings related to the __init annotation for homa_load. +CFLAGS_homa_plumbing.o += -Wno-missing-attributes diff --git a/cloudlab/bin/update_linux b/cloudlab/bin/update_linux index fe22b543..c67f12b0 100755 --- a/cloudlab/bin/update_linux +++ b/cloudlab/bin/update_linux @@ -16,7 +16,7 @@ # "first" defaults to 1. 
v=`uname -r` -#v=5.17.7+ +v=4.18.0+ if [ $# -eq 2 ]; then first=$2 diff --git a/homa_devel.c b/homa_devel.c index 272aede9..0627bfa9 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -1266,6 +1266,8 @@ void homa_validate_rbtree(struct rb_node *node, int depth, char *message) tt_printk(); BUG_ON(1); } +#else + return; #endif /* __UNIT_TEST__ */ } #endif /* See strip.py */ diff --git a/homa_grant.c b/homa_grant.c index 80e90b78..e8bc76ff 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -74,6 +74,7 @@ static struct ctl_table grant_ctl_table[] = { .mode = 0644, .proc_handler = homa_grant_dointvec }, + {} }; #endif /* See strip.py */ @@ -1166,7 +1167,7 @@ void homa_grant_update_sysctl_deps(struct homa_grant *grant) * * Return: 0 for success, nonzero for error. */ -int homa_grant_dointvec(const struct ctl_table *table, int write, +int homa_grant_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; diff --git a/homa_grant.h b/homa_grant.h index 3fa7a8f8..1326b1eb 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -236,7 +236,7 @@ void homa_grant_cand_check(struct homa_grant_candidates *cand, struct homa_grant *grant); void homa_grant_check_fifo(struct homa_grant *grant); void homa_grant_check_rpc(struct homa_rpc *rpc); -int homa_grant_dointvec(const struct ctl_table *table, int write, +int homa_grant_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa_grant *grant); diff --git a/homa_impl.h b/homa_impl.h index 61c4c912..f88f32c3 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -37,11 +37,10 @@ #include #include #include +#include #include #include #include -#include -#include #ifndef __UPSTREAM__ /* See strip.py */ #include "homa.h" @@ -72,6 +71,11 @@ struct homa_peer; struct homa_rpc; struct homa_sock; +/* Features not present in all kernels: */ +#ifndef __cond_acquires +#define 
__cond_acquires(x) +#endif + #ifndef __STRIP__ /* See strip.py */ #include "timetrace.h" #include "homa_metrics.h" @@ -711,8 +715,8 @@ int homa_copy_to_user(struct homa_rpc *rpc); void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_destroy(struct homa *homa); void homa_dispatch_pkts(struct sk_buff *skb); -int homa_err_handler_v4(struct sk_buff *skb, u32 info); -int homa_err_handler_v6(struct sk_buff *skb, +void homa_err_handler_v4(struct sk_buff *skb, u32 info); +void homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); int homa_fill_data_interleaved(struct homa_rpc *rpc, @@ -739,7 +743,7 @@ int homa_net_start(struct net *net); __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); + int flags, int noblock, int *addr_len); void homa_request_retrans(struct homa_rpc *rpc); void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk); @@ -747,7 +751,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc); int homa_rpc_tx_end(struct homa_rpc *rpc); int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int homa_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); + char __user *optval, unsigned int optlen); int homa_shutdown(struct socket *sock, int how); int homa_socket(struct sock *sk); int homa_softirq(struct sk_buff *skb); @@ -771,7 +775,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); #ifndef __STRIP__ /* See strip.py */ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); -int homa_dointvec(const struct ctl_table *table, int write, +int homa_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void homa_incoming_sysctl_changed(struct homa *homa); int homa_ioc_abort(struct socket *sock, unsigned long arg); @@ 
-780,7 +784,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, void homa_prios_changed(struct homa *homa); void homa_resend_data(struct homa_rpc *rpc, int start, int end, int priority); -int homa_sysctl_softirq_cores(const struct ctl_table *table, +int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, diff --git a/homa_incoming.c b/homa_incoming.c index 06181802..64fb372f 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -167,13 +167,11 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) struct homa_gap *gap, *dummy, *gap2; int start = ntohl(h->seg.offset); int length = homa_data_len(skb); - enum skb_drop_reason reason; int end = start + length; if ((start + length) > rpc->msgin.length) { tt_record3("Packet extended past message end; id %d, offset %d, length %d", rpc->id, start, length); - reason = SKB_DROP_REASON_PKT_TOO_BIG; goto discard; } @@ -189,7 +187,6 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) rpc->msgin.recv_end, start)) { tt_record2("Couldn't allocate gap for id %d (start %d): no memory", rpc->id, start); - reason = SKB_DROP_REASON_NOMEM; goto discard; } rpc->msgin.recv_end = end; @@ -207,13 +204,11 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (start < gap->start) { tt_record4("Packet overlaps gap start: id %d, start %d, end %d, gap_start %d", rpc->id, start, end, gap->start); - reason = SKB_DROP_REASON_DUP_FRAG; goto discard; } if (end > gap->end) { tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d", rpc->id, start, end, gap->start); - reason = SKB_DROP_REASON_DUP_FRAG; goto discard; } gap->start = end; @@ -233,7 +228,6 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (end > gap->end) { tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d", rpc->id, start, end, gap->start); - reason = 
SKB_DROP_REASON_DUP_FRAG; goto discard; } gap->end = start; @@ -245,7 +239,6 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (!gap2) { tt_record2("Couldn't allocate gap for split for id %d (start %d): no memory", rpc->id, end); - reason = SKB_DROP_REASON_NOMEM; goto discard; } gap2->time = gap->time; @@ -262,7 +255,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) #endif /* See strip.py */ tt_record4("homa_add_packet discarding packet for id %d, offset %d, length %d, retransmit %d", rpc->id, start, length, h->retransmit); - kfree_skb_reason(skb, reason); + kfree_skb(skb); return; keep: @@ -358,6 +351,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) int offset = ntohl(h->seg.offset); int buf_bytes, chunk_size; struct iov_iter iter; + struct iovec iov; int copied = 0; char __user *dst; @@ -377,13 +371,12 @@ int homa_copy_to_user(struct homa_rpc *rpc) } chunk_size = buf_bytes; } - error = import_ubuf(READ, dst, chunk_size, - &iter); - if (error) - goto free_skbs; + iov.iov_base = dst; + iov.iov_len = chunk_size; + iov_iter_init(&iter, READ, &iov, 1, chunk_size); error = skb_copy_datagram_iter(skbs[i], sizeof(*h) + - copied, &iter, + copied, &iter, chunk_size); if (error) goto free_skbs; @@ -457,8 +450,8 @@ void homa_dispatch_pkts(struct sk_buff *skb) hsk = homa_sock_find(hnet, dport); if (!hsk || (!homa_is_client(id) && !hsk->is_server)) { if (skb_is_ipv6(skb)) - icmp6_send(skb, ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, 0, NULL, IP6CB(skb)); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, 0); else icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); diff --git a/homa_metrics.c b/homa_metrics.c index dee3b123..fd0e82f4 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -9,11 +9,11 @@ DEFINE_PER_CPU(struct homa_metrics, homa_metrics); /* Describes file operations implemented for /proc/net/homa_metrics. 
*/ -static const struct proc_ops homa_metrics_ops = { - .proc_open = homa_metrics_open, - .proc_read = homa_metrics_read, - .proc_lseek = homa_metrics_lseek, - .proc_release = homa_metrics_release, +static const struct file_operations homa_metrics_ops = { + .open = homa_metrics_open, + .read = homa_metrics_read, + .llseek = homa_metrics_lseek, + .release = homa_metrics_release, }; /* Global information used to export metrics information through a file in diff --git a/homa_offload.c b/homa_offload.c index 812b567c..04b79ae7 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -8,6 +8,7 @@ #include "homa_offload.h" #include "homa_pacer.h" #include "homa_qdisc.h" +#include "homa_wire.h" DEFINE_PER_CPU(struct homa_offload_core, homa_offload_core); @@ -43,7 +44,7 @@ static struct net_offload hook_tcp6_net_offload; */ int homa_offload_init(void) { - int i; + int i, res1, res2; for (i = 0; i < nr_cpu_ids; i++) { struct homa_offload_core *offload_core; @@ -62,8 +63,8 @@ int homa_offload_init(void) offload_core->held_bucket = 0; } - int res1 = inet_add_offload(&homa_offload, IPPROTO_HOMA); - int res2 = inet6_add_offload(&homa_offload, IPPROTO_HOMA); + res1 = inet_add_offload(&homa_offload, IPPROTO_HOMA); + res2 = inet6_add_offload(&homa_offload, IPPROTO_HOMA); return res1 ? res1 : res2; } @@ -132,12 +133,12 @@ void homa_gro_unhook_tcp(void) * homa_tcp_gro_receive() - Invoked instead of TCP's normal gro_receive function * when hooking is enabled. Identifies Homa-over-TCP packets and passes them * to Homa; sends real TCP packets to TCP's gro_receive function. - * @held_list: Pointer to header for list of packets that are being + * @gro_list: Pointer to pointer to first in list of packets that are being * held for possible GRO merging. * @skb: The newly arrived packet. 
*/ -struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, - struct sk_buff *skb) +struct sk_buff **homa_tcp_gro_receive(struct sk_buff **gro_list, + struct sk_buff *skb) { struct homa_common_hdr *h = (struct homa_common_hdr *) skb_transport_header(skb); @@ -147,7 +148,7 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, // ntohs(h->urgent), homa_local_id(h->sender_id)); if (h->flags != HOMA_TCP_FLAGS || ntohs(h->urgent) != HOMA_TCP_URGENT) - return tcp_net_offload->callbacks.gro_receive(held_list, skb); + return tcp_net_offload->callbacks.gro_receive(gro_list, skb); /* Change the packet's IP protocol to Homa so that it will get * dispatched directly to Homa in the future. @@ -160,7 +161,7 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, htons(IPPROTO_HOMA)); ip_hdr(skb)->protocol = IPPROTO_HOMA; } - return homa_gro_receive(held_list, skb); + return homa_gro_receive(gro_list, skb); } /** @@ -171,17 +172,17 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, * @cpu: Index of core to which the packet should be directed for * SoftIRQ processing. 
*/ -static void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) +void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) { struct rps_sock_flow_table *sock_flow_table; int hash; rcu_read_lock(); - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); if (sock_flow_table) { - hash = cpu + net_hotdata.rps_cpu_mask + 1; + hash = cpu + rps_cpu_mask + 1; if (sock_flow_table->ents[hash] != hash) { - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); sock_flow_table->ents[hash] = hash; } __skb_set_sw_hash(skb, hash, false); @@ -266,17 +267,15 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, * unusual way: it simply aggregates all packets targeted to a particular * destination port, so that the entire bundle can get through the networking * stack in a single traversal. - * @held_list: Pointer to header for list of packets that are being - * held for possible GRO merging. Note: this list contains - * only packets matching a given hash. + * @gro_list: Pointer to pointer to first in list of packets that are being + * held for possible GRO merging. * @skb: The newly arrived packet. * - * Return: If the return value is non-NULL, it refers to an skb in - * gro_list. The skb will be removed from the list by the caller and - * passed up the stack immediately. + * Return: If the return value is non-NULL, it refers to a link in + * gro_list. The skb referred to by that link should be removed from the + * list by the caller and passed up the stack immediately. */ -struct sk_buff *homa_gro_receive(struct list_head *held_list, - struct sk_buff *skb) +struct sk_buff **homa_gro_receive(struct sk_buff **gro_list, struct sk_buff *skb) { /* This function will do one of the following things: * 1. 
Merge skb with a packet in gro_list by appending it to @@ -291,14 +290,14 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, struct homa *homa = homa_net(dev_net(skb->dev))->homa; u64 saved_softirq_metric, softirq_cycles; struct homa_offload_core *offload_core; - struct sk_buff *result = NULL; + struct sk_buff **result = NULL; struct homa_data_hdr *h_new; u64 *softirq_cycles_metric; struct sk_buff *held_skb; u64 now = homa_clock(); + struct sk_buff **pp; int priority; u32 saddr; - u32 hash; int busy; if (!homa_make_header_avl(skb)) @@ -360,109 +359,54 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, #endif /* See strip.py */ } - /* The GRO mechanism tries to separate packets onto different - * gro_lists by hash. This is bad for us, because we want to batch - * packets together regardless of their RPCs. So, instead of - * checking the list they gave us, check the last list where this - * core added a Homa packet (if there is such a list). - */ - hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - if (offload_core->held_skb) { - /* Reverse-engineer the location of the gro_node, so we - * can verify that held_skb is still valid. - */ - struct gro_list *gro_list = container_of(held_list, - struct gro_list, list); -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) - struct napi_struct *napi = container_of(gro_list, - struct napi_struct, gro_hash[hash]); -#else - struct gro_node *gro_node = container_of(gro_list, - struct gro_node, hash[hash]); -#endif + h_new->common.gro_count = 1; + for (pp = gro_list; (held_skb = *pp) != NULL; pp = &held_skb->next) { + struct homa_common_hdr *h_held; + int protocol; + + h_held = (struct homa_common_hdr *)skb_transport_header( + held_skb); - /* Must verify that offload_core->held_skb points to a packet on - * the list, and that the packet is a Homa packet. 
- * homa_gro_complete isn't always invoked before removing - * packets from the list, so offload_core->held_skb could be a - * dangling pointer (or the skb could have been reused for - * some other protocol). + /* Packets can be batched together as long as they are all + * Homa packets, even if they are from different RPCs. Don't + * use the same_flow mechanism that is normally used in + * gro_receive, because it won't allow packets from different + * sources to be aggregated. */ - list_for_each_entry(held_skb, -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) - &napi->gro_hash[offload_core->held_bucket].list, -#else - &gro_node->hash[offload_core->held_bucket].list, -#endif - list) { - int protocol; - - if (held_skb != offload_core->held_skb) - continue; - if (skb_is_ipv6(held_skb)) - protocol = ipv6_hdr(held_skb)->nexthdr; - else - protocol = ip_hdr(held_skb)->protocol; - if (protocol != IPPROTO_HOMA) { - tt_record3("homa_gro_receive held_skb 0x%0x%0x isn't Homa: protocol %d", - tt_hi(held_skb), tt_lo(held_skb), - protocol); - continue; - } + if (skb_is_ipv6(held_skb)) + protocol = ipv6_hdr(held_skb)->nexthdr; + else + protocol = ip_hdr(held_skb)->protocol; + if (protocol != IPPROTO_HOMA) + continue; - /* Aggregate skb into held_skb. We don't update the - * length of held_skb because we'll eventually split - * it up and process each skb independently. - */ - if (NAPI_GRO_CB(held_skb)->last == held_skb) - skb_shinfo(held_skb)->frag_list = skb; - else - NAPI_GRO_CB(held_skb)->last->next = skb; - NAPI_GRO_CB(held_skb)->last = skb; - skb->next = NULL; - NAPI_GRO_CB(skb)->same_flow = 1; - NAPI_GRO_CB(held_skb)->count++; - if (NAPI_GRO_CB(held_skb)->count >= homa->max_gro_skbs) { - /* Push this batch up through the SoftIRQ - * layer. This code is a hack, needed because - * returning skb as result is no longer - * sufficient (as of 5.4.80) to push it up - * the stack; the packet just gets queued on - * gro_node->rx_list. 
This code basically steals - * the packet from dev_gro_receive and - * pushes it upward. - */ - skb_list_del_init(held_skb); - homa_gro_complete(held_skb, 0); - netif_receive_skb(held_skb); - homa_send_ipis(); -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) - napi->gro_hash[offload_core->held_bucket].count--; - if (napi->gro_hash[offload_core->held_bucket].count == 0) - __clear_bit(offload_core->held_bucket, - &napi->gro_bitmask); -#else - gro_node->hash[offload_core->held_bucket].count--; - if (gro_node->hash[offload_core->held_bucket].count == 0) - __clear_bit(offload_core->held_bucket, - &gro_node->bitmask); -#endif - result = ERR_PTR(-EINPROGRESS); - } - goto done; - } + /* Aggregate skb into held_skb. We don't update the length of + * held_skb, because we'll eventually split it up and process + * each skb independently. + */ + if (NAPI_GRO_CB(held_skb)->last == held_skb) + skb_shinfo(held_skb)->frag_list = skb; + else + NAPI_GRO_CB(held_skb)->last->next = skb; + NAPI_GRO_CB(held_skb)->last = skb; + skb->next = NULL; + NAPI_GRO_CB(skb)->same_flow = 1; + NAPI_GRO_CB(held_skb)->count++; + h_held->gro_count++; + if (h_held->gro_count >= homa->max_gro_skbs) + result = pp; + goto done; } /* There was no existing Homa packet that this packet could be - * batched with, so this packet will become the new merge_skb. - * If the packet is sent up the stack before another packet - * arrives for batching, we want it to be processed on this same - * core (it's faster that way, and if batching doesn't occur it - * means we aren't heavily loaded; if batching does occur, - * homa_gro_complete will pick a different core). + * batched with, so this packet will now go on gro_list for future + * packets to be batched with. 
If the packet is sent up the stack + * before another packet arrives for batching, we want it to be + * processed on this same core (it's faster that way, and if + * batching doesn't occur it means we aren't heavily loaded; if + * batching does occur, homa_gro_complete will pick a different + * core). */ - offload_core->held_skb = skb; - offload_core->held_bucket = hash; if (likely(homa->gro_policy & HOMA_GRO_SAME_CORE)) homa_set_softirq_cpu(skb, smp_processor_id()); diff --git a/homa_offload.h b/homa_offload.h index 936230e2..722c65c3 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -79,16 +79,17 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb); void homa_gro_hook_tcp(void); void homa_gro_unhook_tcp(void); #endif /* See strip.py */ -struct sk_buff *homa_gro_receive(struct list_head *gro_list, +struct sk_buff **homa_gro_receive(struct sk_buff **gro_list, struct sk_buff *skb); struct sk_buff *homa_gso_segment(struct sk_buff *skb, netdev_features_t features); int homa_offload_end(void); int homa_offload_init(void); void homa_send_ipis(void); +void homa_set_softirq_cpu(struct sk_buff *skb, int cpu); #ifndef __STRIP__ /* See strip.py */ -struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, - struct sk_buff *skb); +struct sk_buff **homa_tcp_gro_receive(struct sk_buff **gro_list, + struct sk_buff *skb); #endif /* See strip.py */ #endif /* _HOMA_OFFLOAD_H */ diff --git a/homa_outgoing.c b/homa_outgoing.c index 12cce77a..af22593c 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -476,8 +476,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #ifndef __STRIP__ /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, - NULL, hsk->homa->priority_map[priority] << 5, - 0); + NULL, hsk->homa->priority_map[priority] << 5); } else { /* This will find its way to the DSCP field in the IPv4 hdr. 
*/ hsk->inet.tos = hsk->homa->priority_map[priority] << 5; @@ -498,7 +497,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #else /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, - NULL, 0, 0); + NULL, 0); else result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); #endif /* See strip.py */ @@ -688,10 +687,10 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) #ifndef __STRIP__ /* See strip.py */ err = ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, 0, NULL, - rpc->hsk->homa->priority_map[priority] << 5, 0); + rpc->hsk->homa->priority_map[priority] << 5); #else /* See strip.py */ ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, - 0, NULL, 0, 0); + 0, NULL, 0); #endif /* See strip.py */ } else { tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", diff --git a/homa_peer.c b/homa_peer.c index 6aa36a0c..a5d4c24e 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -64,6 +64,7 @@ static struct ctl_table peer_ctl_table[] = { .mode = 0644, .proc_handler = homa_peer_dointvec }, + {} }; #endif /* See strip.py */ @@ -516,7 +517,7 @@ struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) */ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) { - struct dst_entry *dst; + struct dst_entry *dst, *old; int result = 0; homa_peer_lock(peer); @@ -530,8 +531,7 @@ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) ipv6_to_ipv4(peer->addr), hsk->inet.inet_saddr, 0, 0, hsk->sock.sk_uid); - security_sk_classify_flow(&hsk->sock, - &peer->flow.u.__fl_common); + security_sk_classify_flow(&hsk->sock, &peer->flow); rt = ip_route_output_flow(sock_net(&hsk->sock), &peer->flow.u.ip4, &hsk->sock); if (IS_ERR(rt)) { @@ -553,16 +553,15 @@ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) peer->flow.u.ip6.fl6_dport = 0; peer->flow.u.ip6.fl6_sport = 0; 
peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; - security_sk_classify_flow(&hsk->sock, - &peer->flow.u.__fl_common); - dst = ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock, - &peer->flow.u.ip6, NULL); + security_sk_classify_flow(&hsk->sock, &peer->flow); + dst = ip6_dst_lookup_flow(&hsk->sock, &peer->flow.u.ip6, + &peer->addr); if (IS_ERR(dst)) { result = PTR_ERR(dst); INC_METRIC(peer_route_errors, 1); goto done; } - peer->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); + peer->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); } /* From the standpoint of homa_get_dst, peer->dst is not updated @@ -572,7 +571,9 @@ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) * a lost packet) or a valid dst to be replaced (resulting in * unnecessary work). */ - dst_release(rcu_replace_pointer(peer->dst, dst, true)); + old = rcu_dereference_protected(peer->dst, lockdep_is_held(&peer->lock)); + rcu_assign_pointer(peer->dst, dst); + dst_release(old); done: homa_peer_unlock(peer); @@ -729,7 +730,7 @@ void homa_peer_update_sysctl_deps(struct homa_peertab *peertab) * * Return: 0 for success, nonzero for error. 
*/ -int homa_peer_dointvec(const struct ctl_table *table, int write, +int homa_peer_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct homa_peertab *peertab; diff --git a/homa_peer.h b/homa_peer.h index 65d6701a..e5df965a 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -288,7 +288,7 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, const struct in6_addr *addr); struct homa_peertab *homa_peer_alloc_peertab(void); -int homa_peer_dointvec(const struct ctl_table *table, int write, +int homa_peer_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void homa_peer_free(struct rcu_head *head); void homa_peer_free_net(struct homa_net *hnet); diff --git a/homa_plumbing.c b/homa_plumbing.c index e8a4a020..6c4b9e39 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -119,7 +119,6 @@ static struct proto homav6_prot = { .hash = homa_hash, .unhash = homa_unhash, .obj_size = sizeof(struct homa_v6_sock), - .ipv6_pinfo_offset = offsetof(struct homa_v6_sock, inet6), .no_autobind = 1, }; @@ -145,6 +144,7 @@ static struct net_protocol homa_protocol = { .handler = homa_softirq, .err_handler = homa_err_handler_v4, .no_policy = 1, + .netns_ok = 1, }; static struct inet6_protocol homav6_protocol = { @@ -412,6 +412,7 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, + {} }; #endif /* See strip.py */ @@ -995,7 +996,7 @@ int homa_socket(struct sock *sk) * on errors. 
*/ int homa_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) + char __user *optval, unsigned int optlen) { struct homa_sock *hsk = homa_sk(sk); int ret; @@ -1011,12 +1012,12 @@ int homa_setsockopt(struct sock *sk, int level, int optname, u64 start = homa_clock(); #endif /* See strip.py */ - if (optlen != sizeof(struct homa_rcvbuf_args)) { + if (optlen != sizeof(args)) { hsk->error_msg = "invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)"; return -EINVAL; } - if (copy_from_sockptr(&args, optval, optlen)) { + if (unlikely(copy_from_user(&args, optval, optlen))) { hsk->error_msg = "invalid address for homa_rcvbuf_args"; return -EFAULT; } @@ -1042,7 +1043,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } - if (copy_from_sockptr(&arg, optval, optlen)) { + if (unlikely(copy_from_user(&arg, optval, optlen))) { hsk->error_msg = "invalid address for SO_HOMA_SERVER value"; return -EFAULT; } @@ -1080,7 +1081,7 @@ int homa_getsockopt(struct sock *sk, int level, int optname, void *result; int len; - if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int))) { + if (unlikely(copy_from_user(&len, optlen, sizeof(int)))) { hsk->error_msg = "invalid address for optlen argument to getsockopt"; return -EFAULT; } @@ -1114,12 +1115,12 @@ int homa_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } - if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) { + if (copy_to_user(optlen, &len, sizeof(int))) { hsk->error_msg = "couldn't update optlen argument to getsockopt: read-only?"; return -EFAULT; } - if (copy_to_sockptr(USER_SOCKPTR(optval), result, len)) { + if (copy_to_user(optval, result, len)) { hsk->error_msg = "couldn't update optval argument to getsockopt: read-only?"; return -EFAULT; } @@ -1159,12 +1160,6 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) goto error; } - if (unlikely(!msg->msg_control_is_user)) { - 
tt_record("homa_sendmsg error: !msg->msg_control_is_user"); - hsk->error_msg = "msg_control argument for sendmsg isn't in user space"; - result = -EINVAL; - goto error; - } if (unlikely(copy_from_user(&args, (void __user *)msg->msg_control, sizeof(args)))) { hsk->error_msg = "invalid address for msg_control argument to sendmsg"; @@ -1222,8 +1217,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) args.id = rpc->id; homa_rpc_unlock(rpc); /* Locked by homa_rpc_alloc_client. */ - if (unlikely(copy_to_user((void __user *)msg->msg_control, - &args, sizeof(args)))) { + if (unlikely(copy_to_user((void __user *)msg->msg_control, &args, + sizeof(args)))) { homa_rpc_lock(rpc); hsk->error_msg = "couldn't update homa_sendmsg_args argument to sendmsg: read-only?"; result = -EFAULT; @@ -1309,16 +1304,16 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) * @len: Total bytes of space available in msg->msg_iov; not used. * @flags: Flags from system call; only MSG_DONTWAIT is used. * @addr_len: Store the length of the sender address here + * @noblock: Non-zero means MSG_DONTWAIT was specified * Return: The length of the message on success, otherwise a negative * errno. Sets hsk->error_msg on errors. */ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) + int noblock, int *addr_len) { struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; struct homa_rpc *rpc = NULL; - int nonblocking; int result; IF_NO_STRIP(u64 start = homa_clock()); @@ -1371,7 +1366,6 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, goto done; } - nonblocking = flags & MSG_DONTWAIT; if (control.id != 0) { rpc = homa_rpc_find_client(hsk, control.id); /* Locks RPC. 
*/ if (!rpc) { @@ -1380,14 +1374,14 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, goto done; } homa_rpc_hold(rpc); - result = homa_wait_private(rpc, nonblocking); + result = homa_wait_private(rpc, noblock); if (result != 0) { hsk->error_msg = "error while waiting for private RPC to complete"; control.id = 0; goto done; } } else { - rpc = homa_wait_shared(hsk, nonblocking); + rpc = homa_wait_shared(hsk, noblock); if (IS_ERR(rpc)) { /* If we get here, it means there was an error that * prevented us from finding an RPC to return. Errors @@ -1528,7 +1522,6 @@ int homa_softirq(struct sk_buff *skb) { struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; - enum skb_drop_reason reason; struct homa_common_hdr *h; int header_offset; @@ -1566,7 +1559,6 @@ int homa_softirq(struct sk_buff *skb) pr_notice("Homa can't handle fragmented packet (no space for header); discarding\n"); #endif /* See strip.py */ UNIT_LOG("", "pskb discard"); - reason = SKB_DROP_REASON_HDR_TRUNC; goto discard; } header_offset = skb_transport_header(skb) - skb->data; @@ -1588,7 +1580,6 @@ int homa_softirq(struct sk_buff *skb) skb->len - header_offset); #endif /* See strip.py */ INC_METRIC(short_packets, 1); - reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto discard; } @@ -1607,7 +1598,6 @@ int homa_softirq(struct sk_buff *skb) homa_local_id(h->sender_id)); tt_freeze(); } - reason = SKB_CONSUMED; goto discard; } #endif /* See strip.py */ @@ -1629,7 +1619,7 @@ int homa_softirq(struct sk_buff *skb) discard: *prev_link = skb->next; - kfree_skb_reason(skb, reason); + kfree_skb(skb); } /* Now process the longer packets. Each iteration of this loop @@ -1690,10 +1680,8 @@ int homa_softirq(struct sk_buff *skb) * the ICMP header (the first byte of the embedded packet IP header). * @skb: The incoming packet. * @info: Information about the error that occurred? - * - * Return: zero, or a negative errno if the error couldn't be handled here. 
*/ -int homa_err_handler_v4(struct sk_buff *skb, u32 info) +void homa_err_handler_v4(struct sk_buff *skb, u32 info) { struct homa *homa = homa_net(dev_net(skb->dev))->homa; const struct icmphdr *icmp = icmp_hdr(skb); @@ -1723,7 +1711,6 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) } if (error != 0) homa_abort_rpcs(homa, &daddr, port, error); - return 0; } /** @@ -1736,10 +1723,8 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) * @code: Additional information about the error. * @offset: Not used. * @info: Information about the error that occurred? - * - * Return: zero, or a negative errno if the error couldn't be handled here. */ -int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, +void homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; @@ -1760,7 +1745,6 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (error != 0) homa_abort_rpcs(homa, &iph->daddr, port, error); - return 0; } /** @@ -1781,7 +1765,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, __poll_t mask; mask = 0; - sock_poll_wait(file, sock, wait); + sock_poll_wait(file, sk_sleep(sock->sk), wait); tt_record2("homa_poll found sk_wmem_alloc %d, sk_sndbuf %d", refcount_read(&hsk->sock.sk_wmem_alloc), hsk->sock.sk_sndbuf); @@ -1812,7 +1796,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, * * Return: 0 for success, nonzero for error. */ -int homa_dointvec(const struct ctl_table *table, int write, +int homa_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct homa *homa = homa_net(current->nsproxy->net_ns)->homa; @@ -1901,7 +1885,7 @@ int homa_dointvec(const struct ctl_table *table, int write, * * Return: 0 for success, nonzero for error. 
*/ -int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, +int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct homa_offload_core *offload_core; @@ -2011,7 +1995,7 @@ int homa_timer_main(void *transport) homa_timer(homa); } hrtimer_cancel(&hrtimer); - kthread_complete_and_exit(&timer_thread_done, 0); + complete_and_exit(&timer_thread_done, 0); return 0; } diff --git a/homa_qdisc.c b/homa_qdisc.c index f16552fe..2fd88b0d 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -123,6 +123,7 @@ static struct ctl_table homa_qdisc_ctl_table[] = { .mode = 0644, .proc_handler = homa_qdisc_dointvec }, + {} }; static struct Qdisc_ops homa_qdisc_ops __read_mostly = { @@ -428,8 +429,7 @@ void homa_qdisc_destroy(struct Qdisc *qdisc) spin_lock_bh(&q->qdev->defer_lock); while (!skb_queue_empty(&q->deferred_tcp)) - kfree_skb_reason(__skb_dequeue(&q->deferred_tcp), - SKB_DROP_REASON_QDISC_DROP); + kfree_skb(__skb_dequeue(&q->deferred_tcp)); list_del_init(&q->defer_links); if (q->qdev->congested_qdisc == q) q->qdev->congested_qdisc = NULL; @@ -761,7 +761,7 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) tt_record_tcp("homa_qdisc_pacer requeued TCP packet from " "0x%x to 0x%x, data bytes %d, seq/ack %u", skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); - homa_qdisc_schedule_skb(skb, qdisc_from_priv(q)); + homa_qdisc_schedule_skb(skb, q->qdisc); homa_qdisc_update_congested(q); return pkt_len; } @@ -897,7 +897,7 @@ int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev) homa_qdisc_schedule_skb(skb, qdisc); homa_qdisc_update_congested(qdisc_priv(qdisc)); } else { - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + kfree_skb(skb); } rcu_read_unlock_bh(); return pkt_len; @@ -916,7 +916,7 @@ void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev) skb = homa_qdisc_get_deferred_homa(qdev); if (!skb) break; - kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); + kfree_skb(skb); } } @@ -1153,7 
+1153,7 @@ void homa_qdisc_pacer_check(struct homa *homa) * * Return: 0 for success, nonzero for error. */ -int homa_qdisc_dointvec(const struct ctl_table *table, int write, +int homa_qdisc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; diff --git a/homa_qdisc.h b/homa_qdisc.h index 47aa07c7..c59c974a 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -317,7 +317,7 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb); void homa_qdisc_destroy(struct Qdisc *sch); void homa_qdisc_dev_callback(struct rcu_head *head); -int homa_qdisc_dointvec(const struct ctl_table *table, int write, +int homa_qdisc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); @@ -436,7 +436,7 @@ static inline int homa_qdisc_bytes_pending(struct homa_qdisc *q) /* Ideally this function would be provided by dynamic_queue_limits.h * so that we don't have to root around in its data structures. */ - struct dql *dql = &qdisc_from_priv(q)->dev_queue->dql; + struct dql *dql = &q->qdisc->dev_queue->dql; return READ_ONCE(dql->num_queued) - READ_ONCE(dql->num_completed); } diff --git a/homa_rpc.c b/homa_rpc.c index 1b6a36e2..41afa94c 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -579,10 +579,9 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * buffers left. 
*/ if (rpc->msgin.length >= 0 && - !skb_queue_empty_lockless(&rpc->msgin.packets)) { + !skb_queue_empty(&rpc->msgin.packets)) { rx_frees += skb_queue_len(&rpc->msgin.packets); - __skb_queue_purge_reason(&rpc->msgin.packets, - SKB_CONSUMED); + __skb_queue_purge(&rpc->msgin.packets); } /* If we get here, it means all packets have been diff --git a/homa_skb.c b/homa_skb.c index 58f10807..01783960 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -17,9 +17,23 @@ extern int mock_max_skb_frags; #define HOMA_MAX_SKB_FRAGS MAX_SKB_FRAGS #endif +/* This function was added to later versions of the kernel, but isn't + * available in this version. + */ +static inline void skb_len_add(struct sk_buff *skb, int delta) +{ + skb->len += delta; + skb->data_len += delta; + skb->truesize += delta; +} + static void frag_page_set(skb_frag_t *frag, struct page *page) { +#ifdef CONFIG_NETMEM frag->netmem = page_to_netmem(page); +#else + __skb_frag_set_page(frag, page); +#endif } /** @@ -190,7 +204,7 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) frag = &shinfo->frags[shinfo->nr_frags - 1]; if (skb_frag_page(frag) == skb_core->skb_page && skb_core->page_inuse < skb_core->page_size && - (frag->offset + skb_frag_size(frag)) == + (frag->page_offset + skb_frag_size(frag)) == skb_core->page_inuse) { if ((skb_core->page_size - skb_core->page_inuse) < actual_size) @@ -220,7 +234,7 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) shinfo->nr_frags++; frag_page_set(frag, skb_core->skb_page); get_page(skb_core->skb_page); - frag->offset = skb_core->page_inuse; + frag->page_offset = skb_core->page_inuse; *length = actual_size; skb_frag_size_set(frag, actual_size); result = page_address(skb_frag_page(frag)) + skb_core->page_inuse; @@ -418,7 +432,7 @@ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, dst_shinfo->nr_frags++; frag_page_set(dst_frag, skb_frag_page(src_frag)); get_page(skb_frag_page(src_frag)); - 
dst_frag->offset = src_frag->offset + dst_frag->page_offset = src_frag->page_offset + (offset - src_frag_offset); skb_frag_size_set(dst_frag, chunk_size); offset += chunk_size; @@ -567,9 +581,9 @@ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) chunk_size = skb_frag_size(frag) - (offset - frag_offset); if (chunk_size > length) chunk_size = length; - memcpy(dst, page_address(skb_frag_page(frag)) + frag->offset - + (offset - frag_offset), - chunk_size); + memcpy(dst, page_address(skb_frag_page(frag)) + + frag->page_offset + (offset - frag_offset), + chunk_size); offset += chunk_size; length -= chunk_size; dst += chunk_size; @@ -585,7 +599,7 @@ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) void homa_skb_release_pages(struct homa *homa) { int i, max_low_mark, min_pages, release, release_max; - struct homa_page_pool *max_pool; + struct homa_page_pool *max_pool = NULL; u64 now = homa_clock(); if (now < homa->skb_page_free_time) diff --git a/homa_wire.h b/homa_wire.h index 948be506..e86e5f91 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -152,10 +152,12 @@ struct homa_common_hdr { #endif /* See strip.py */ /** - * @window: Corresponds to the window field in TCP headers. Not used - * by HOMA. + * @gro_count: Corresponds to the window field in TCP headers, which + * isn't used by Homa. Value on the wire is undefined. Used only by + * homa_offload.c (it counts the total number of packets aggregated + * into this packet, including the top-level packet). 
*/ - __be16 window; + __u16 gro_count; /** * @checksum: Not used by Homa, but must occupy the same bytes as diff --git a/murmurhash3.h b/murmurhash3.h index 1ed1f0b6..e76a219f 100644 --- a/murmurhash3.h +++ b/murmurhash3.h @@ -18,9 +18,10 @@ static inline u32 murmurhash3(const void *data, u32 len, u32 seed) const u32 c2 = 0x1b873593; const u32 *key = data; u32 h = seed; + size_t i; len = len >> 2; - for (size_t i = 0; i < len; i++) { + for (i = 0; i < len; i++) { u32 k = key[i]; k *= c1; diff --git a/notes.txt b/notes.txt index b008fc3d..eaab54a3 100755 --- a/notes.txt +++ b/notes.txt @@ -1,5 +1,11 @@ Notes for Homa implementation in Linux: --------------------------------------- +* Backporting to 4.18.0: + * Things to port forward: + * Change to tt_unfreeze + * Changes in tthoma.py + * Changes in homa_qdisc.c + * Changes in test/Makefile * (12/12/25) Something is wrong with the xl170 cluster: both Homa and TCP are showing considerably worse performance than previously. I tried multiple diff --git a/test/Makefile b/test/Makefile index a56c54a7..8b7e037d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,7 @@ # Makefile to run unit tests for Homa -LINUX_VERSION ?= $(shell uname -r) +LINUX_VERSION := 4.18.0+ +# LINUX_VERSION ?= $(shell uname -r) KDIR ?= /lib/modules/$(LINUX_VERSION)/build LINUX_SRC_DIR ?= /ouster/linux-stable CC ?= gcc @@ -37,7 +38,8 @@ endif WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare -Wuninitialized \ -Wno-strict-aliasing -Wunused-but-set-variable -Werror -CFLAGS := $(WARNS) -Wstrict-prototypes -MD -no-pie -g $(CINCLUDES) $(DEFS) +CFLAGS := $(WARNS) -Wstrict-prototypes -Wno-pointer-sign -MD -no-pie \ + -g $(CINCLUDES) $(DEFS) CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address TEST_SRCS := unit_homa_incoming.c \ diff --git a/test/mock.c b/test/mock.c index 6f453746..850d1136 100644 --- a/test/mock.c +++ b/test/mock.c @@ -15,6 +15,7 @@ #include "utils.h" #include +#include /* It isn't safe to include 
some header files, such as stdlib, because * they conflict with kernel header files. The explicit declarations @@ -287,20 +288,17 @@ unsigned int nr_cpu_ids = 8; unsigned long page_offset_base; unsigned long phys_base; unsigned long vmemmap_base; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) -kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; -#endif +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; int __preempt_count; + +/* Value that will be returned by smp_processor_id. */ int cpu_number = 1; + char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; -struct net_hotdata net_hotdata = { - .rps_cpu_mask = 0x1f, - .rps_sock_flow_table = (struct rps_sock_flow_table *) sock_flow_table -}; +struct rps_sock_flow_table *rps_sock_flow_table + = (struct rps_sock_flow_table *) sock_flow_table; +__u32 rps_cpu_mask = 0x1f; int debug_locks; -struct static_call_key __SCK__cond_resched; -struct static_call_key __SCK__might_resched; -struct static_call_key __SCK__preempt_schedule; struct paravirt_patch_template pv_ops; struct workqueue_struct *system_wq; struct static_key_true validate_usercopy_range; @@ -315,6 +313,12 @@ extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} +int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, + size_t max_size, unsigned int cpu_mult, gfp_t gfp) +{ + return 0; +} + struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node) { @@ -372,6 +376,11 @@ bool cancel_work_sync(struct work_struct *work) void __check_object_size(const void *ptr, unsigned long n, bool to_user) {} +int _cond_resched(void) +{ + return 0; +} + void consume_skb(struct sk_buff *skb) { kfree_skb(skb); } @@ -388,7 +397,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) return 0; } while (bytes_left > 0) { - struct iovec *iov = (struct iovec *) iter_iov(iter); + struct iovec *iov = (struct iovec *)iter->iov; u64 int_base = (u64) iov->iov_base; size_t 
chunk_bytes = iov->iov_len; @@ -401,7 +410,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) iov->iov_base = (void *) (int_base + chunk_bytes); iov->iov_len -= chunk_bytes; if (iov->iov_len == 0) - iter->__iov++; + iter->iov++; } return bytes; } @@ -467,7 +476,7 @@ int debug_lockdep_rcu_enabled(void) } #endif -int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *) +int do_wait_intr_irq(wait_queue_head_t *head, wait_queue_entry_t *entry) { UNIT_HOOK("do_wait_intr_irq"); if (mock_check_error(&mock_wait_intr_irq_errors)) @@ -479,8 +488,8 @@ void dst_release(struct dst_entry *dst) { if (!dst) return; - atomic_dec(&dst->__rcuref.refcnt); - if (atomic_read(&dst->__rcuref.refcnt) > 0) + atomic_dec(&dst->__refcnt); + if (atomic_read(&dst->__refcnt) > 0) return; if (!routes_in_use || unit_hash_get(routes_in_use, dst) == NULL) { FAIL(" %s on unknown route", __func__); @@ -494,7 +503,7 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} -void get_random_bytes(void *buf, size_t nbytes) +void get_random_bytes(void *buf, int nbytes) { memset(buf, 0, nbytes); } @@ -540,15 +549,12 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode) {} -void __icmp_send(struct sk_buff *skb, int type, int code, __be32 info, - const struct ip_options *opt) +void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { unit_log_printf("; ", "icmp_send type %d, code %d", type, code); } -void icmp6_send(struct sk_buff *skb, u8 type, u8 code, u32 info, - const struct in6_addr *force_saddr, - const struct inet6_skb_parm *parm) +void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { unit_log_printf("; ", "icmp6_send type %d, code %d", type, code); } @@ -558,8 +564,8 @@ int idle_cpu(int cpu) return mock_check_error(&mock_cpu_idle); } -ssize_t import_iovec(int type, const struct iovec __user *uvector, - unsigned int nr_segs, unsigned int fast_segs, 
+int import_iovec(int type, const struct iovec __user *uvector, + unsigned nr_segs, unsigned fast_segs, struct iovec **iov, struct iov_iter *iter) { ssize_t size; @@ -577,14 +583,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvector, return size; } -int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) -{ - if (mock_check_error(&mock_import_ubuf_errors)) - return -EACCES; - iov_iter_ubuf(i, rw, buf, len); - return 0; -} - int inet6_add_offload(const struct net_offload *prot, unsigned char protocol) { return 0; @@ -695,16 +693,15 @@ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) {} void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, - struct lock_class_key *) + struct lock_class_key *key) {} -void iov_iter_init(struct iov_iter *i, unsigned int direction, - const struct iovec *iov, unsigned long nr_segs, - size_t count) +void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, + unsigned long nr_segs, size_t count) { direction &= READ | WRITE; - i->iter_type = ITER_IOVEC | direction; - i->__iov = iov; + i->type = ITER_IOVEC | direction; + i->iov = iov; i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; @@ -727,8 +724,8 @@ struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) return dst; } -struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, - struct flowi6 *fl6, const struct in6_addr *final_dst) +struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst) { struct rtable *route; @@ -739,7 +736,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, FAIL(" malloc failed"); return ERR_PTR(-ENOMEM); } - atomic_set(&route->dst.__rcuref.refcnt, 1); + atomic_set(&route->dst.__refcnt, 1); route->dst.ops = &mock_dst_ops; route->dst.dev = &mock_devices[0]; route->dst.obsolete = 0; @@ -755,7 +752,7 @@ unsigned int ip6_mtu(const struct dst_entry 
*dst) } int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) + u32 mark, struct ipv6_txoptions *opt, int tclass) { char buffer[200]; const char *prefix = " "; @@ -830,7 +827,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, FAIL(" malloc failed"); return ERR_PTR(-ENOMEM); } - atomic_set(&route->dst.__rcuref.refcnt, 1); + atomic_set(&route->dst.__refcnt, 1); route->dst.ops = &mock_dst_ops; route->dst.dev = &mock_devices[0]; route->dst.obsolete = 0; @@ -849,20 +846,22 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, void device_set_wakeup_capable(struct device *dev, bool capable) {} -void device_wakeup_disable(struct device *dev) -{} +int device_wakeup_disable(struct device *dev) +{ + return 0; +} int device_wakeup_enable(struct device *dev) { return 0; } -int filp_close(struct file *, fl_owner_t id) +int filp_close(struct file *f, fl_owner_t id) { return 0; } -struct file *filp_open(const char *, int, umode_t) +struct file *filp_open(const char *f, int dummy, umode_t mode) { return NULL; } @@ -875,6 +874,11 @@ void __fortify_panic(const u8 reason, const size_t avail, const size_t size) while (1) ; } +void free_bucket_spinlocks(spinlock_t *locks) +{ + kvfree(locks); +} + ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { return 0; @@ -899,11 +903,7 @@ void kfree(const void *block) free((void *) block); } -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) -#else -void __kfree_skb(struct sk_buff *skb) -#endif +void kfree_skb(struct sk_buff *skb) { int i; struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -929,19 +929,9 @@ void __kfree_skb(struct sk_buff *skb) free(skb); } -void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) -{ - while (segs) { - struct sk_buff *next = segs->next; - - __kfree_skb(segs); - 
segs = next; - } -} - void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - return mock_kmalloc(size, gfpflags); + return kmalloc(size, gfpflags); } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP @@ -951,7 +941,7 @@ void __might_sleep(const char *file, int line) } #endif -void *mock_kmalloc(size_t size, gfp_t flags) +void *__kmalloc(size_t size, gfp_t flags) { void *block; @@ -977,17 +967,12 @@ void *mock_kmalloc(size_t size, gfp_t flags) void *__kmalloc_noprof(size_t size, gfp_t flags) { - return mock_kmalloc(size, flags); + return kmalloc(size, flags); } -void kvfree(const void *addr) +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { - kfree(addr); -} - -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) -{ - return mock_kmalloc(size, flags); + return __kmalloc(size, flags); } struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -1011,6 +996,16 @@ int kthread_stop(struct task_struct *k) return 0; } +void kvfree(const void *addr) +{ + kfree(addr); +} + +void *kvmalloc_node(size_t size, gfp_t flags, int node) +{ + return __kmalloc(size, flags); +} + #ifdef CONFIG_DEBUG_LIST bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) @@ -1064,9 +1059,8 @@ void lock_sock_nested(struct sock *sk, int subclass) mock_active_locks++; sk->sk_lock.owned = 1; } - -ssize_t __modver_version_show(const struct module_attribute *a, - struct module_kobject *b, char *c) +ssize_t __modver_version_show(struct module_attribute *mattr, + struct module_kobject *mk, char *buf) { return 0; } @@ -1108,22 +1102,6 @@ int netif_receive_skb(struct sk_buff *skb) void __netif_schedule(struct Qdisc *q) {} -void preempt_count_add(int val) -{ - int i; - - for (i = 0; i < val; i++) - preempt_disable(); -} - -void preempt_count_sub(int val) -{ - int i; - - for (i = 0; i < val; i++) - preempt_enable(); -} - long prepare_to_wait_event(struct wait_queue_head *wq_head, struct 
wait_queue_entry *wq_entry, int state) { @@ -1133,7 +1111,7 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, return 0; } -int _printk(const char *format, ...) +int printk(const char *format, ...) { int len = strlen(mock_printk_output); int available; @@ -1166,7 +1144,7 @@ int _printk(const char *format, ...) struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct proc_ops *proc_ops) + const struct file_operations *proc_fops) { struct proc_dir_entry *entry = malloc(40); @@ -1264,23 +1242,12 @@ int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) return 1; } -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) -{ - UNIT_HOOK("unlock"); - mock_record_unlocked(lock); -} - void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { UNIT_HOOK("unlock"); mock_record_unlocked(lock); } -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) -{ - mock_record_unlocked(lock); -} - void _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { @@ -1321,20 +1288,7 @@ int rcu_read_lock_bh_held(void) } #endif -void __rcu_read_lock(void) -{} - -void __rcu_read_unlock(void) -{} - -bool rcuref_get_slowpath(rcuref_t *ref) -{ - return true; -} - -void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {} - -int register_pernet_subsys(struct pernet_operations *) +int register_pernet_subsys(struct pernet_operations *ops) { return 0; } @@ -1370,7 +1324,7 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) while (true) { struct sk_buff *next = head->next; - __kfree_skb(head); + kfree_skb(head); if (head == tail) break; head = next; @@ -1403,8 +1357,7 @@ int __SCT__might_resched(void) void __SCT__preempt_schedule(void) {} -void security_sk_classify_flow(const struct sock *sk, - struct flowi_common *flic) +void security_sk_classify_flow(struct sock *sk, struct flowi *fl) {} void __show_free_areas(unsigned int filter, nodemask_t *nodemask, @@ -1419,16 +1372,6 
@@ int sk_set_peek_off(struct sock *sk, int val) return 0; } -void sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, - enum skb_drop_reason reason) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) - kfree_skb(skb); -#else - __kfree_skb(skb); -#endif -} - int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *iter, int size) { @@ -1442,7 +1385,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, return 0; } while (bytes_left > 0) { - struct iovec *iov = (struct iovec *) iter_iov(iter); + struct iovec *iov = (struct iovec *)iter->iov; u64 int_base = (u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; @@ -1458,7 +1401,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, iov->iov_base = (void *) (int_base + chunk_bytes); iov->iov_len -= chunk_bytes; if (iov->iov_len == 0) - iter->__iov++; + iter->iov++; } return 0; } @@ -1497,13 +1440,6 @@ void *skb_put(struct sk_buff *skb, unsigned int len) return result; } -void skb_queue_purge_reason(struct sk_buff_head *list, - enum skb_drop_reason reason) -{ - while (skb_queue_len(list) > 0) - kfree_skb(__skb_dequeue(list)); -} - struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features) { @@ -1533,13 +1469,13 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, } int sock_common_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) + char __user *optval, unsigned int optlen) { return 0; } int sock_no_accept(struct socket *sock, struct socket *newsock, - struct proto_accept_arg *arg) + int dummy1, bool dummy2) { return 0; } @@ -1571,7 +1507,7 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) return 0; } -void synchronize_rcu(void) +void synchronize_sched(void) {} void __tasklet_hi_schedule(struct tasklet_struct *t) @@ -1592,13 +1528,14 @@ void unregister_net_sysctl_table(struct ctl_table_header *header) UNIT_LOG("; ", 
"unregister_net_sysctl_table"); } -void unregister_pernet_subsys(struct pernet_operations *) +void unregister_pernet_subsys(struct pernet_operations *ops) {} -void unregister_qdisc(struct Qdisc_ops *qops) +int unregister_qdisc(struct Qdisc_ops *qops) { registered_qdiscs--; qdisc_ops = NULL; + return 0; } void vfree(const void *block) @@ -1624,16 +1561,14 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, return 0; } -int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key) { if (!mock_log_wakeups) - return 0; - if (nr_exclusive == 1) + return; + if (nr == 1) unit_log_printf("; ", "wake_up"); else unit_log_printf("; ", "wake_up_all"); - return 0; } void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, @@ -2340,7 +2275,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port) mock_socket.sk = sk; sk->sk_net.net = mock_net_for_hnet(hnet); refcount_set(&sk->sk_wmem_alloc, 1); - init_waitqueue_head(&mock_socket.wq.wait); + init_waitqueue_head(&mock_socket.wq->wait); rcu_assign_pointer(sk->sk_wq, &mock_socket.wq); sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; if (port != 0 && port >= mock_min_default_port) diff --git a/test/mock.h b/test/mock.h index c071e77b..3f650c80 100644 --- a/test/mock.h +++ b/test/mock.h @@ -10,10 +10,13 @@ #undef alloc_pages #define alloc_pages mock_alloc_pages +#undef atomic64_cmpxchg_relaxed #define atomic64_cmpxchg_relaxed mock_cmpxchg #undef alloc_percpu_gfp -#define alloc_percpu_gfp(type, flags) mock_kmalloc(10 * sizeof(type), flags) +#define alloc_percpu_gfp(type, flags) __kmalloc(10 * sizeof(type), flags) + +#define complete_and_exit(...) 
#define compound_order mock_compound_order @@ -46,13 +49,8 @@ #define homa_rpc_put mock_rpc_put -#undef kmalloc -#define kmalloc mock_kmalloc - #undef kmalloc_array -#define kmalloc_array(count, size, type) mock_kmalloc((count) * (size), type) - -#define kthread_complete_and_exit(...) +#define kmalloc_array(count, size, type) __kmalloc((count) * (size), type) #undef local_irq_save #define local_irq_save(flags) (flags) = 0 @@ -93,6 +91,8 @@ #undef register_net_sysctl #define register_net_sysctl mock_register_net_sysctl +#define rt6_get_cookie(...) 999 + #define signal_pending(...) mock_signal_pending #undef smp_processor_id @@ -205,7 +205,6 @@ unsigned int void mock_get_page(struct page *page); struct homa_net *mock_hnet(int index, struct homa *homa); -void *mock_kmalloc(size_t size, gfp_t flags); struct net *mock_net_for_hnet(struct homa_net *hnet); void *mock_net_generic(const struct net *net, unsigned int id); int mock_page_refs(struct page *page); diff --git a/test/rbtree.c b/test/rbtree.c index 9e730718..d3ff682f 100644 --- a/test/rbtree.c +++ b/test/rbtree.c @@ -1,10 +1,22 @@ -// SPDX-License-Identifier: GPL-2.0-or-later /* Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse (C) 2012 Michel Lespinasse + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA linux/lib/rbtree.c */ @@ -13,7 +25,7 @@ #include /* - * red-black trees properties: https://en.wikipedia.org/wiki/Rbtree + * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree * * 1) A node is either red or black * 2) The root is black @@ -58,7 +70,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color += RB_BLACK; + rb->__rb_parent_color |= RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) @@ -83,10 +95,14 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + if (newleft) + *leftmost = node; + while (true) { /* * Loop invariant: node is red. @@ -412,6 +428,7 @@ void __rb_erase_color(struct rb_node *parent, struct rb_root *root, { ____rb_erase_color(parent, root, augment_rotate); } +EXPORT_SYMBOL(__rb_erase_color); /* * Non-augmented rbtree manipulation functions. 
@@ -432,16 +449,37 @@ static const struct rb_augment_callbacks dummy_callbacks = { void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, dummy_rotate); + __rb_insert(node, root, false, NULL, dummy_rotate); } +EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, + NULL, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } +EXPORT_SYMBOL(rb_erase); + +void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, bool leftmost) +{ + __rb_insert(node, &root->rb_root, leftmost, + &root->rb_leftmost, dummy_rotate); +} +EXPORT_SYMBOL(rb_insert_color_cached); + +void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) +{ + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); +} +EXPORT_SYMBOL(rb_erase_cached); /* * Augmented rbtree manipulation functions. @@ -451,10 +489,12 @@ void rb_erase(struct rb_node *node, struct rb_root *root) */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, augment_rotate); + __rb_insert(node, root, newleft, leftmost, augment_rotate); } +EXPORT_SYMBOL(__rb_insert_augmented); /* * This function returns the first node (in sort order) of the tree. 
@@ -470,6 +510,7 @@ struct rb_node *rb_first(const struct rb_root *root) n = n->rb_left; return n; } +EXPORT_SYMBOL(rb_first); struct rb_node *rb_last(const struct rb_root *root) { @@ -482,6 +523,7 @@ struct rb_node *rb_last(const struct rb_root *root) n = n->rb_right; return n; } +EXPORT_SYMBOL(rb_last); struct rb_node *rb_next(const struct rb_node *node) { @@ -497,7 +539,7 @@ struct rb_node *rb_next(const struct rb_node *node) if (node->rb_right) { node = node->rb_right; while (node->rb_left) - node = node->rb_left; + node=node->rb_left; return (struct rb_node *)node; } @@ -513,6 +555,7 @@ struct rb_node *rb_next(const struct rb_node *node) return parent; } +EXPORT_SYMBOL(rb_next); struct rb_node *rb_prev(const struct rb_node *node) { @@ -528,7 +571,7 @@ struct rb_node *rb_prev(const struct rb_node *node) if (node->rb_left) { node = node->rb_left; while (node->rb_right) - node = node->rb_right; + node=node->rb_right; return (struct rb_node *)node; } @@ -541,6 +584,7 @@ struct rb_node *rb_prev(const struct rb_node *node) return parent; } +EXPORT_SYMBOL(rb_prev); void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root) @@ -557,6 +601,39 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, rb_set_parent(victim->rb_right, new); __rb_change_child(victim, new, parent, root); } +EXPORT_SYMBOL(rb_replace_node); + +void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root) +{ + rb_replace_node(victim, new, &root->rb_root); + + if (root->rb_leftmost == victim) + root->rb_leftmost = new; +} +EXPORT_SYMBOL(rb_replace_node_cached); + +void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); + + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; + + /* Set the surrounding nodes to point to the replacement */ + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); 
+ if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + /* Set the parent's pointer to the new node last after an RCU barrier + * so that the pointers onwards are seen to be set correctly when doing + * an RCU walk over the tree. + */ + __rb_change_child_rcu(victim, new, parent, root); +} +EXPORT_SYMBOL(rb_replace_node_rcu); static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { @@ -587,6 +664,7 @@ struct rb_node *rb_next_postorder(const struct rb_node *node) * should be next */ return (struct rb_node *)parent; } +EXPORT_SYMBOL(rb_next_postorder); struct rb_node *rb_first_postorder(const struct rb_root *root) { @@ -595,3 +673,4 @@ struct rb_node *rb_first_postorder(const struct rb_root *root) return rb_left_deepest_node(root->rb_node); } +EXPORT_SYMBOL(rb_first_postorder); diff --git a/test/rhashtable.c b/test/rhashtable.c index 3e555d01..a9cbe019 100644 --- a/test/rhashtable.c +++ b/test/rhashtable.c @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * @@ -9,6 +8,10 @@ * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
*/ #include @@ -26,12 +29,16 @@ #include #include +void mock_spin_unlock(spinlock_t *lock); +#define spin_unlock mock_spin_unlock + #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U +#define BUCKET_LOCKS_PER_CPU 32UL union nested_table { union nested_table __rcu *table; - struct rhash_lock_head __rcu *bucket; + struct rhash_head __rcu *bucket; }; static u32 head_hashfn(struct rhashtable *ht, @@ -52,33 +59,22 @@ EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { - if (!debug_locks) - return 1; - if (unlikely(tbl->nest)) - return 1; - return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); + spinlock_t *lock = rht_bucket_lock(tbl, hash); + + return (debug_locks) ? lockdep_is_held(lock) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) #endif -static inline union nested_table *nested_table_top( - const struct bucket_table *tbl) -{ - /* The top-level bucket entry does not need RCU protection - * because it's set at the same time as tbl->nest. 
- */ - return (void *)rcu_dereference_protected(tbl->buckets[0], 1); -} - static void nested_table_free(union nested_table *ntbl, unsigned int size) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); const unsigned int len = 1 << shift; unsigned int i; - ntbl = rcu_dereference_protected(ntbl->table, 1); + ntbl = rcu_dereference_raw(ntbl->table); if (!ntbl) return; @@ -98,7 +94,7 @@ static void nested_bucket_table_free(const struct bucket_table *tbl) union nested_table *ntbl; unsigned int i; - ntbl = nested_table_top(tbl); + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); @@ -111,6 +107,7 @@ static void bucket_table_free(const struct bucket_table *tbl) if (tbl->nest) nested_bucket_table_free(tbl); + free_bucket_spinlocks(tbl->locks); kvfree(tbl); } @@ -121,7 +118,8 @@ static void bucket_table_free_rcu(struct rcu_head *head) static union nested_table *nested_table_alloc(struct rhashtable *ht, union nested_table __rcu **prev, - bool leaf) + unsigned int shifted, + unsigned int nhash) { union nested_table *ntbl; int i; @@ -130,19 +128,17 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht, if (ntbl) return ntbl; - ntbl = alloc_hooks_tag(ht->alloc_tag, - kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); - if (ntbl && leaf) { - for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) - INIT_RHT_NULLS_HEAD(ntbl[i].bucket); + if (ntbl && shifted) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, + (i << shifted) | nhash); } - if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) - return ntbl; - /* Raced with another thread. 
*/ - kfree(ntbl); - return rcu_dereference(*prev); + rcu_assign_pointer(*prev, ntbl); + + return ntbl; } static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, @@ -158,13 +154,12 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, size = sizeof(*tbl) + sizeof(tbl->buckets[0]); - tbl = alloc_hooks_tag(ht->alloc_tag, - kmalloc_noprof(size, gfp|__GFP_ZERO)); + tbl = kzalloc(size, gfp); if (!tbl) return NULL; if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, - false)) { + 0, 0)) { kfree(tbl); return NULL; } @@ -179,35 +174,42 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, gfp_t gfp) { struct bucket_table *tbl = NULL; - size_t size; + size_t size, max_locks; int i; - static struct lock_class_key __key; - tbl = alloc_hooks_tag(ht->alloc_tag, - kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), - gfp|__GFP_ZERO, NUMA_NO_NODE)); + size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); + if (gfp != GFP_KERNEL) + tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); + else + tbl = kvzalloc(size, gfp); size = nbuckets; - if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { + if (tbl == NULL && gfp != GFP_KERNEL) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } - if (tbl == NULL) return NULL; - lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); - tbl->size = size; - rcu_head_init(&tbl->rcu); + max_locks = size >> 1; + if (tbl->nest) + max_locks = min_t(size_t, max_locks, 1U << tbl->nest); + + if (alloc_bucket_spinlocks(&tbl->locks, &tbl->locks_mask, max_locks, + ht->p.locks_mul, gfp) < 0) { + bucket_table_free(tbl); + return NULL; + } + INIT_LIST_HEAD(&tbl->walkers); tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) - INIT_RHT_NULLS_HEAD(tbl->buckets[i]); + INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); return tbl; } @@ -225,25 +227,23 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, return 
new_tbl; } -static int rhashtable_rehash_one(struct rhashtable *ht, - struct rhash_lock_head __rcu **bkt, - unsigned int old_hash) +static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); + struct bucket_table *new_tbl = rhashtable_last_table(ht, + rht_dereference_rcu(old_tbl->future_tbl, ht)); + struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash); int err = -EAGAIN; struct rhash_head *head, *next, *entry; - struct rhash_head __rcu **pprev = NULL; + spinlock_t *new_bucket_lock; unsigned int new_hash; - unsigned long flags; if (new_tbl->nest) goto out; err = -ENOENT; - rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), - old_tbl, old_hash) { + rht_for_each(entry, old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -258,20 +258,18 @@ static int rhashtable_rehash_one(struct rhashtable *ht, new_hash = head_hashfn(ht, new_tbl, entry); - flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], - SINGLE_DEPTH_NESTING); + new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); - head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); + spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); + head = rht_dereference_bucket(new_tbl->buckets[new_hash], + new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); - rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags); + rcu_assign_pointer(new_tbl->buckets[new_hash], entry); + spin_unlock(new_bucket_lock); - if (pprev) - rcu_assign_pointer(*pprev, next); - else - /* Need to preserved the bit lock. 
*/ - rht_assign_locked(bkt, next); + rcu_assign_pointer(*pprev, next); out: return err; @@ -281,20 +279,20 @@ static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); - unsigned long flags; + spinlock_t *old_bucket_lock; int err; - if (!bkt) - return 0; - flags = rht_lock(old_tbl, bkt); + old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); - while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) + spin_lock_bh(old_bucket_lock); + while (!(err = rhashtable_rehash_one(ht, old_hash))) ; - if (err == -ENOENT) + if (err == -ENOENT) { + old_tbl->rehash++; err = 0; - rht_unlock(old_tbl, bkt, flags); + } + spin_unlock_bh(old_bucket_lock); return err; } @@ -303,15 +301,21 @@ static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { + /* Protect future_tbl using the first bucket lock. */ + spin_lock_bh(old_tbl->locks); + + /* Did somebody beat us to it? */ + if (rcu_access_pointer(old_tbl->future_tbl)) { + spin_unlock_bh(old_tbl->locks); + return -EEXIST; + } + /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. - * As cmpxchg() provides strong barriers, we do not need - * rcu_assign_pointer(). */ + rcu_assign_pointer(old_tbl->future_tbl, new_tbl); - if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, - new_tbl) != NULL) - return -EEXIST; + spin_unlock_bh(old_tbl->locks); return 0; } @@ -341,16 +345,13 @@ static int rhashtable_rehash_table(struct rhashtable *ht) spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; + spin_unlock(&ht->lock); /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. 
- * We do this inside the locked region so that - * rhashtable_walk_stop() can use rcu_head_after_call_rcu() - * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); - spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } @@ -430,12 +431,8 @@ static void rht_deferred_worker(struct work_struct *work) else if (tbl->nest) err = rhashtable_rehash_alloc(ht, tbl, tbl->size); - if (!err || err == -EEXIST) { - int nerr; - - nerr = rhashtable_rehash_table(ht); - err = err ?: nerr; - } + if (!err) + err = rhashtable_rehash_table(ht); mutex_unlock(&ht->mutex); @@ -465,7 +462,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, err = -ENOMEM; - new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); if (new_tbl == NULL) goto fail; @@ -481,7 +478,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, fail: /* Do not fail the insert if someone else did a rehash. */ - if (likely(rcu_access_pointer(tbl->future_tbl))) + if (likely(rcu_dereference_raw(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. 
*/ @@ -492,7 +489,6 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, } static void *rhashtable_lookup_one(struct rhashtable *ht, - struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, const void *key, struct rhash_head *obj) { @@ -500,12 +496,13 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, .ht = ht, .key = key, }; - struct rhash_head __rcu **pprev = NULL; + struct rhash_head __rcu **pprev; struct rhash_head *head; int elasticity; elasticity = RHT_ELASTICITY; - rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(head, *pprev, tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; @@ -527,11 +524,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); - if (pprev) - rcu_assign_pointer(*pprev, obj); - else - /* Need to preserve the bit lock */ - rht_assign_locked(bkt, obj); + rcu_assign_pointer(*pprev, obj); return NULL; } @@ -542,11 +535,13 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, return ERR_PTR(-ENOENT); } -static struct bucket_table *rhashtable_insert_one( - struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, - struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, - void *data) +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) { + struct rhash_head __rcu **pprev; struct bucket_table *new_tbl; struct rhash_head *head; @@ -556,7 +551,7 @@ static struct bucket_table *rhashtable_insert_one( if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) return ERR_CAST(data); - new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + new_tbl = rcu_dereference(tbl->future_tbl); if (new_tbl) return new_tbl; @@ -569,7 +564,11 @@ static struct bucket_table 
*rhashtable_insert_one( if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); - head = rht_ptr(bkt, tbl, hash); + pprev = rht_bucket_insert(ht, tbl, hash); + if (!pprev) + return ERR_PTR(-ENOMEM); + + head = rht_dereference_bucket(*pprev, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { @@ -579,10 +578,11 @@ static struct bucket_table *rhashtable_insert_one( RCU_INIT_POINTER(list->next, NULL); } - /* bkt is always the head of the list, so it holds - * the lock, which we need to preserve - */ - rht_assign_locked(bkt, obj); + rcu_assign_pointer(*pprev, obj); + + atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); return NULL; } @@ -592,44 +592,47 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, { struct bucket_table *new_tbl; struct bucket_table *tbl; - struct rhash_lock_head __rcu **bkt; - unsigned long flags; unsigned int hash; + spinlock_t *lock; void *data; - new_tbl = rcu_dereference(ht->tbl); + tbl = rcu_dereference(ht->tbl); - do { + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. 
+ */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { tbl = new_tbl; hash = rht_head_hashfn(ht, tbl, obj, ht->p); - if (rcu_access_pointer(tbl->future_tbl)) - /* Failure is OK */ - bkt = rht_bucket_var(tbl, hash); - else - bkt = rht_bucket_insert(ht, tbl, hash); - if (bkt == NULL) { - new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); - data = ERR_PTR(-EAGAIN); - } else { - bool inserted; - - flags = rht_lock(tbl, bkt); - data = rhashtable_lookup_one(ht, bkt, tbl, - hash, key, obj); - new_tbl = rhashtable_insert_one(ht, bkt, tbl, - hash, obj, data); - inserted = data && !new_tbl; - if (inserted) - atomic_inc(&ht->nelems); - if (PTR_ERR(new_tbl) != -EEXIST) - data = ERR_CAST(new_tbl); - - rht_unlock(tbl, bkt, flags); - - if (inserted && rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); - } - } while (!IS_ERR_OR_NULL(new_tbl)); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); if (PTR_ERR(data) == -EAGAIN) data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: @@ -669,7 +672,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * structure outside the hash table. * * This function may be called from any process context, including - * non-preemptible context, but cannot be called from softirq or + * non-preemptable context, but cannot be called from softirq or * hardirq context. 
* * You must call rhashtable_walk_exit after this function returns. @@ -694,7 +697,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_enter); * rhashtable_walk_exit - Free an iterator * @iter: Hash table Iterator * - * This function frees resources allocated by rhashtable_walk_enter. + * This function frees resources allocated by rhashtable_walk_init. */ void rhashtable_walk_exit(struct rhashtable_iter *iter) { @@ -715,7 +718,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * * Returns zero if successful. * - * Returns -EAGAIN if resize event occurred. Note that the iterator + * Returns -EAGAIN if resize event occured. Note that the iterator * will rewind back to the beginning and you may use it immediately * by calling rhashtable_walk_next. * @@ -951,11 +954,10 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) ht = iter->ht; spin_lock(&ht->lock); - if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) - /* This bucket table is being freed, don't re-link it. */ - iter->walker.tbl = NULL; - else + if (tbl->rehash < tbl->size) list_add(&iter->walker.list, &tbl->walkers); + else + iter->walker.tbl = NULL; spin_unlock(&ht->lock); out: @@ -1003,6 +1005,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, + * .nulls_base = (1U << RHT_BASE_SHIFT), * }; * * Configuration Example 2: Variable length keys @@ -1024,7 +1027,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) * .obj_hashfn = my_hash_fn, * }; */ -int rhashtable_init_noprof(struct rhashtable *ht, +int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; @@ -1034,13 +1037,14 @@ int rhashtable_init_noprof(struct rhashtable *ht, (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; + if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) + return -EINVAL; + memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); 
spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); - alloc_tag_record(ht->alloc_tag); - if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); @@ -1057,6 +1061,11 @@ int rhashtable_init_noprof(struct rhashtable *ht, size = rounded_hashtable_size(&ht->p); + if (params->locks_mul) + ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); + else + ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + ht->key_len = ht->p.key_len; if (!params->hashfn) { ht->p.hashfn = jhash; @@ -1067,16 +1076,9 @@ int rhashtable_init_noprof(struct rhashtable *ht, } } - /* - * This is api initialization and thus we need to guarantee the - * initial rhashtable allocation. Upon failure, retry with the - * smallest possible size with __GFP_NOFAIL semantics. - */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (unlikely(tbl == NULL)) { - size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); - tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); - } + if (tbl == NULL) + return -ENOMEM; atomic_set(&ht->nelems, 0); @@ -1086,7 +1088,7 @@ int rhashtable_init_noprof(struct rhashtable *ht, return 0; } -EXPORT_SYMBOL_GPL(rhashtable_init_noprof); +EXPORT_SYMBOL_GPL(rhashtable_init); /** * rhltable_init - initialize a new hash list table @@ -1097,15 +1099,19 @@ EXPORT_SYMBOL_GPL(rhashtable_init_noprof); * * See documentation for rhashtable_init. */ -int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) { int err; - err = rhashtable_init_noprof(&hlt->ht, params); + /* No rhlist NULLs marking for now. 
*/ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); hlt->ht.rhlist = true; return err; } -EXPORT_SYMBOL_GPL(rhltable_init_noprof); +EXPORT_SYMBOL_GPL(rhltable_init); static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, void (*free_fn)(void *ptr, void *arg), @@ -1158,7 +1164,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, struct rhash_head *pos, *next; cond_resched(); - for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), + for (pos = rht_dereference(*rht_bucket(tbl, i), ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); @@ -1185,16 +1191,18 @@ void rhashtable_destroy(struct rhashtable *ht) } EXPORT_SYMBOL_GPL(rhashtable_destroy); -struct rhash_lock_head __rcu **__rht_bucket_nested( - const struct bucket_table *tbl, unsigned int hash) +struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + static struct rhash_head __rcu *rhnull = + (struct rhash_head __rcu *)NULLS_MARKER(0); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; unsigned int subhash = hash; union nested_table *ntbl; - ntbl = nested_table_top(tbl); + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; @@ -1207,43 +1215,40 @@ struct rhash_lock_head __rcu **__rht_bucket_nested( } if (!ntbl) - return NULL; + return &rhnull; return &ntbl[subhash].bucket; } -EXPORT_SYMBOL_GPL(__rht_bucket_nested); - -struct rhash_lock_head __rcu **rht_bucket_nested( - const struct bucket_table *tbl, unsigned int hash) -{ - static struct rhash_lock_head __rcu *rhnull; - - if (!rhnull) - INIT_RHT_NULLS_HEAD(rhnull); - return __rht_bucket_nested(tbl, hash) ?: &rhnull; -} EXPORT_SYMBOL_GPL(rht_bucket_nested); -struct rhash_lock_head __rcu 
**rht_bucket_nested_insert( - struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; + unsigned int shifted; + unsigned int nhash; - ntbl = nested_table_top(tbl); + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); hash >>= tbl->nest; + nhash = index; + shifted = tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift)); + size <= (1 << shift) ? shifted : 0, nhash); while (ntbl && size > (1 << shift)) { index = hash & ((1 << shift) - 1); size >>= shift; hash >>= shift; + nhash |= index << shifted; + shifted += shift; ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift)); + size <= (1 << shift) ? shifted : 0, + nhash); } if (!ntbl) diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 4ce7e5e0..2e928b1c 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -894,23 +894,6 @@ TEST_F(homa_incoming, homa_copy_to_user__skb_data_extends_past_message_end) homa_rpc_unlock(crpc); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_incoming, homa_copy_to_user__error_in_import_ubuf) -{ - struct homa_rpc *crpc; - - crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 1000, 4000); - ASSERT_NE(NULL, crpc); - - unit_log_clear(); - mock_import_ubuf_errors = 1; - homa_rpc_lock(crpc); - EXPECT_EQ(13, -homa_copy_to_user(crpc)); - homa_rpc_unlock(crpc); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); -} TEST_F(homa_incoming, homa_copy_to_user__error_in_skb_copy_datagram_iter) { struct homa_rpc *crpc; diff --git a/test/unit_homa_offload.c 
b/test/unit_homa_offload.c index c64dd7a0..672206ef 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -11,13 +11,13 @@ #define cur_offload_core (&per_cpu(homa_offload_core, smp_processor_id())) -static struct sk_buff *test_tcp_gro_receive(struct list_head *held_list, +static struct sk_buff **test_tcp_gro_receive(struct sk_buff **gro_list, struct sk_buff *skb) { UNIT_LOG("; ", "test_tcp_gro_receive"); return NULL; } -static struct sk_buff *unit_tcp6_gro_receive(struct list_head *held_list, +static struct sk_buff **unit_tcp6_gro_receive(struct sk_buff **gro_list, struct sk_buff *skb) { UNIT_LOG("; ", "unit_tcp6_gro_receive"); @@ -31,16 +31,13 @@ FIXTURE(homa_offload) struct homa_sock hsk; struct in6_addr ip; struct homa_data_hdr header; - struct napi_struct napi; + struct sk_buff *gro_list; struct sk_buff *skb, *skb2; - struct list_head empty_list; struct net_offload tcp_offloads; struct net_offload tcp6_offloads; }; FIXTURE_SETUP(homa_offload) { - int i; - homa_init(&self->homa); self->hnet = mock_hnet(0, &self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; @@ -58,28 +55,22 @@ FIXTURE_SETUP(homa_offload) self->header.message_length = htonl(10000); self->header.incoming = htonl(10000); self->header.seg.offset = htonl(2000); - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&self->napi.gro.hash[i].list); - self->napi.gro.hash[i].count = 0; - } - self->napi.gro.bitmask = 0; self->skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 2000); NAPI_GRO_CB(self->skb)->same_flow = 0; + ((struct iphdr *) skb_network_header(self->skb))->protocol = IPPROTO_HOMA+1; + NAPI_GRO_CB(self->skb)->data_offset = sizeof(struct homa_data_hdr); NAPI_GRO_CB(self->skb)->last = self->skb; - NAPI_GRO_CB(self->skb)->count = 1; self->header.seg.offset = htonl(4000); self->header.common.dport = htons(88); self->header.common.sender_id = cpu_to_be64(1002); self->skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); 
NAPI_GRO_CB(self->skb2)->same_flow = 0; + NAPI_GRO_CB(self->skb2)->data_offset = sizeof(struct homa_data_hdr); NAPI_GRO_CB(self->skb2)->last = self->skb2; - NAPI_GRO_CB(self->skb2)->count = 1; - self->napi.gro.bitmask = 6; - self->napi.gro.hash[2].count = 2; - list_add_tail(&self->skb->list, &self->napi.gro.hash[2].list); - list_add_tail(&self->skb2->list, &self->napi.gro.hash[2].list); - INIT_LIST_HEAD(&self->empty_list); + self->gro_list = self->skb; + self->skb->next = self->skb2; + self->skb2->next = NULL; self->tcp_offloads.callbacks.gro_receive = test_tcp_gro_receive; inet_offloads[IPPROTO_TCP] = &self->tcp_offloads; self->tcp6_offloads.callbacks.gro_receive = unit_tcp6_gro_receive; @@ -95,11 +86,12 @@ FIXTURE_SETUP(homa_offload) } FIXTURE_TEARDOWN(homa_offload) { - struct sk_buff *skb, *tmp; - homa_offload_end(); - list_for_each_entry_safe(skb, tmp, &self->napi.gro.hash[2].list, list) - kfree_skb(skb); + while (self->gro_list) { + struct sk_buff *next = self->gro_list->next; + kfree_skb(self->gro_list); + self->gro_list = next; + } homa_destroy(&self->homa); unit_teardown(); } @@ -139,7 +131,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); h = (struct homa_common_hdr *) skb_transport_header(skb); h->flags = 0; - EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->gro_list, skb)); EXPECT_STREQ("test_tcp_gro_receive", unit_log_get()); kfree_skb(skb); unit_log_clear(); @@ -147,7 +139,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); h = (struct homa_common_hdr *)skb_transport_header(skb); h->urgent -= 1; - EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->gro_list, skb)); EXPECT_STREQ("test_tcp_gro_receive", unit_log_get()); kfree_skb(skb); homa_gro_unhook_tcp(); @@ -166,13 +158,11 @@ 
TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; - cur_offload_core->held_skb = NULL; - cur_offload_core->held_bucket = 99; - EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_EQ(skb, cur_offload_core->held_skb); - EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->gro_list, skb)); EXPECT_EQ(IPPROTO_HOMA, ipv6_hdr(skb)->nexthdr); - kfree_skb(skb); + EXPECT_STREQ("", unit_log_get()); + unit_log_frag_list(self->gro_list, 0); + EXPECT_STREQ("DATA 1400@6000", unit_log_get()); homa_gro_unhook_tcp(); } TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) @@ -182,24 +172,33 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) mock_ipv6 = false; homa_gro_hook_tcp(); - self->header.seg.offset = htonl(6000); + self->header.seg.offset = htonl(7000); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); ip_hdr(skb)->protocol = IPPROTO_TCP; h = (struct homa_common_hdr *)skb_transport_header(skb); h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; - cur_offload_core->held_skb = NULL; - cur_offload_core->held_bucket = 99; - EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_EQ(skb, cur_offload_core->held_skb); - EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->gro_list, skb)); EXPECT_EQ(IPPROTO_HOMA, ip_hdr(skb)->protocol); + EXPECT_STREQ("", unit_log_get()); + unit_log_frag_list(self->gro_list, 0); + EXPECT_STREQ("DATA 1400@7000", unit_log_get()); EXPECT_EQ(29695, ip_hdr(skb)->check); - kfree_skb(skb); homa_gro_unhook_tcp(); } +TEST_F(homa_offload, homa_set_softirq_cpu) +{ + struct sk_buff *skb; + + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + homa_set_softirq_cpu(skb, 3); + EXPECT_EQ(rps_cpu_mask + 4, skb->hash); + + kfree_skb(skb); +} + TEST_F(homa_offload, 
homa_gso_segment_set_ip_ids) { struct sk_buff *skb, *segs; @@ -230,9 +229,7 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) self->header.seg.offset = -1; skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; - cur_offload_core->held_skb = NULL; - cur_offload_core->held_bucket = 99; - EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->gro_list, skb)); h = (struct homa_data_hdr *) skb_transport_header(skb); EXPECT_EQ(6000, htonl(h->seg.offset)); @@ -241,18 +238,16 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) self->header.seg.offset = ntohl(5000); skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb2)->same_flow = 0; - EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb2)); + EXPECT_EQ(NULL, homa_gro_receive(&self->gro_list, skb2)); h = (struct homa_data_hdr *)skb_transport_header(skb2); EXPECT_EQ(5000, htonl(h->seg.offset)); - - kfree_skb(skb); - kfree_skb(skb2); } TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) { struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); - struct sk_buff *skb, *skb2, *skb3, *skb4, *result; + struct sk_buff *skb, *skb2, *skb3, *skb4; + struct sk_buff **result; int client_port = 40000; u64 client_id = 1234; u64 server_id = 1235; @@ -277,7 +272,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) /* First attempt: HOMA_GRO_SHORT_BYPASS not enabled. 
*/ skb = mock_skb_alloc(&self->ip, &h.common, 1400, 2000); - result = homa_gro_receive(&self->empty_list, skb); + result = homa_gro_receive(&self->gro_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); @@ -287,7 +282,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS; cur_offload_core->last_gro = 400; skb2 = mock_skb_alloc(&self->ip, &h.common, 1400, 2000); - result = homa_gro_receive(&self->empty_list, skb2); + result = homa_gro_receive(&self->gro_list, skb2); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); @@ -296,26 +291,23 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) h.incoming = htonl(1400); cur_offload_core->last_gro = 400; skb3 = mock_skb_alloc(&self->ip, &h.common, 1400, 4000); - result = homa_gro_receive(&self->empty_list, skb3); + result = homa_gro_receive(&self->gro_list, skb3); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); - /* Third attempt: no bypass because core busy. */ + /* Fourth attempt: no bypass because core busy. 
*/ cur_offload_core->last_gro = 600; skb4 = mock_skb_alloc(&self->ip, &h.common, 1400, 4000); - result = homa_gro_receive(&self->empty_list, skb3); + result = homa_gro_receive(&self->gro_list, skb4); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); - - kfree_skb(skb); - kfree_skb(skb2); - kfree_skb(skb4); } TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) { struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); - struct sk_buff *skb, *skb2, *skb3, *result; + struct sk_buff *skb, *skb2, *skb3; + struct sk_buff **result; struct homa_grant_hdr h; int client_port = 40000; u64 client_id = 1234; @@ -341,7 +333,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) /* First attempt: HOMA_GRO_FAST_GRANTS not enabled. */ self->homa.gro_policy = 0; skb = mock_skb_alloc(&client_ip, &h.common, 0, 0); - result = homa_gro_receive(&self->empty_list, skb); + result = homa_gro_receive(&self->gro_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(0, homa_metrics_per_cpu()->gro_grant_bypasses); EXPECT_STREQ("", unit_log_get()); @@ -350,7 +342,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) self->homa.gro_policy = HOMA_GRO_FAST_GRANTS; cur_offload_core->last_gro = 400; skb2 = mock_skb_alloc(&client_ip, &h.common, 0, 0); - result = homa_gro_receive(&self->empty_list, skb2); + result = homa_gro_receive(&self->gro_list, skb2); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); EXPECT_SUBSTR("xmit DATA 1400@10000", unit_log_get()); @@ -358,161 +350,120 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) /* Third attempt: core is too busy for fast grants. 
*/ cur_offload_core->last_gro = 600; skb3 = mock_skb_alloc(&client_ip, &h.common, 0, 0); - result = homa_gro_receive(&self->empty_list, skb3); + result = homa_gro_receive(&self->gro_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); - kfree_skb(skb); - kfree_skb(skb3); } -TEST_F(homa_offload, homa_gro_receive__no_held_skb) +TEST_F(homa_offload, homa_gro_receive__no_held_skbs) { + struct sk_buff *held_list = NULL; struct sk_buff *skb; int same_flow; self->header.seg.offset = htonl(6000); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; - cur_offload_core->held_skb = NULL; - cur_offload_core->held_bucket = 2; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[2].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&held_list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_offload_core->held_skb); - EXPECT_EQ(2, cur_offload_core->held_bucket); kfree_skb(skb); } -TEST_F(homa_offload, homa_gro_receive__empty_merge_list) +TEST_F(homa_offload, homa_gro_receive__skip_held_skbs_that_arent_homa_packets) { struct sk_buff *skb; int same_flow; + if (skb_is_ipv6(self->gro_list)) + ipv6_hdr(self->gro_list)->nexthdr = IPPROTO_TCP; + else + ip_hdr(self->gro_list)->protocol = IPPROTO_TCP; + self->header.seg.offset = htonl(6000); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; - cur_offload_core->held_skb = self->skb; - cur_offload_core->held_bucket = 3; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[2].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->gro_list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; - EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_offload_core->held_skb); - EXPECT_EQ(2, cur_offload_core->held_bucket); - kfree_skb(skb); + EXPECT_EQ(1, same_flow); + unit_log_clear(); + unit_log_frag_list(self->gro_list, 0); + 
EXPECT_STREQ("", + unit_log_get()); + unit_log_clear(); + unit_log_frag_list(self->gro_list->next, 0); + EXPECT_STREQ("DATA 1400@6000", + unit_log_get()); } -TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list) +TEST_F(homa_offload, homa_gro_receive__add_to_frag_list) { struct sk_buff *skb; int same_flow; self->header.seg.offset = htonl(6000); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - skb->hash = 3; NAPI_GRO_CB(skb)->same_flow = 0; - cur_offload_core->held_skb = skb; - cur_offload_core->held_bucket = 2; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->gro_list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; - EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_offload_core->held_skb); - EXPECT_EQ(3, cur_offload_core->held_bucket); - kfree_skb(skb); -} -TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto) -{ - struct sk_buff *skb; - int same_flow; + EXPECT_EQ(1, same_flow); - self->header.seg.offset = htonl(6000); + self->header.seg.offset = htonl(7400); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - skb->hash = 3; NAPI_GRO_CB(skb)->same_flow = 0; - cur_offload_core->held_skb = self->skb; - if (skb_is_ipv6(self->skb)) - ipv6_hdr(self->skb)->nexthdr = IPPROTO_TCP; - else - ip_hdr(self->skb)->protocol = IPPROTO_TCP; - cur_offload_core->held_bucket = 2; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->gro_list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; - EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_offload_core->held_skb); - EXPECT_EQ(3, cur_offload_core->held_bucket); - kfree_skb(skb); + EXPECT_EQ(1, same_flow); + unit_log_clear(); + unit_log_frag_list(self->gro_list, 0); + EXPECT_STREQ("DATA 1400@6000; DATA 1400@7400", unit_log_get()); } -TEST_F(homa_offload, homa_gro_receive__merge) +TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) { - struct sk_buff 
*skb, *skb2; - int same_flow; + struct homa_common_hdr *h; + struct sk_buff *skb; - cur_offload_core->held_skb = self->skb2; - cur_offload_core->held_bucket = 2; + h = (struct homa_common_hdr *)skb_transport_header(self->gro_list); + // First packet fits below the limit. + self->homa.max_gro_skbs = 3; self->header.seg.offset = htonl(6000); - self->header.common.sender_id = cpu_to_be64(1002); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - NAPI_GRO_CB(skb)->same_flow = 0; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); - same_flow = NAPI_GRO_CB(skb)->same_flow; - EXPECT_EQ(1, same_flow); - EXPECT_EQ(2, NAPI_GRO_CB(self->skb2)->count); + EXPECT_EQ(NULL, homa_gro_receive(&self->gro_list, skb)); + EXPECT_EQ(1, NAPI_GRO_CB(self->gro_list)->count); + EXPECT_EQ(1, h->gro_count); - self->header.seg.offset = htonl(7000); - self->header.common.sender_id = cpu_to_be64(1004); - skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - NAPI_GRO_CB(skb2)->same_flow = 0; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb2)); - same_flow = NAPI_GRO_CB(skb)->same_flow; - EXPECT_EQ(1, same_flow); - EXPECT_EQ(3, NAPI_GRO_CB(self->skb2)->count); + // Second packet also fits below the limit. + self->header.seg.offset = htonl(8000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + EXPECT_EQ(NULL, homa_gro_receive(&self->gro_list, skb)); + EXPECT_EQ(2, NAPI_GRO_CB(self->gro_list)->count); + EXPECT_EQ(2, h->gro_count); - unit_log_frag_list(self->skb2, 1); - EXPECT_STREQ("DATA from 196.168.0.1:40000, dport 88, id 1002, message_length 10000, offset 6000, data_length 1400, incoming 10000; " - "DATA from 196.168.0.1:40000, dport 88, id 1004, message_length 10000, offset 7000, data_length 1400, incoming 10000", - unit_log_get()); + // Third packet hits the limit. 
+ self->header.seg.offset = htonl(10000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); + EXPECT_EQ(&self->gro_list, homa_gro_receive(&self->gro_list, skb)); + EXPECT_EQ(3, NAPI_GRO_CB(self->gro_list)->count); + EXPECT_EQ(3, h->gro_count); } -TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) +TEST_F(homa_offload, homa_gro_receive__set_softirq_cpu) { + struct sk_buff *held_list = NULL; struct sk_buff *skb; - // First packet: fits below the limit. - self->homa.max_gro_skbs = 3; - cur_offload_core->held_skb = self->skb2; - cur_offload_core->held_bucket = 2; - self->header.seg.offset = htonl(6000); - skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - homa_gro_receive(&self->napi.gro.hash[3].list, skb); - EXPECT_EQ(2, NAPI_GRO_CB(self->skb2)->count); - EXPECT_EQ(2, self->napi.gro.hash[2].count); + cpu_number = 5; - // Second packet hits the limit. - self->header.common.sport = htons(40001); + /* First call: HOMA_GRO_SAME_CORE not set. */ skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - unit_log_clear(); - EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( - &self->napi.gro.hash[3].list, skb))); - EXPECT_EQ(3, NAPI_GRO_CB(self->skb2)->count); - EXPECT_EQ(1, self->napi.gro.hash[2].count); - EXPECT_STREQ("netif_receive_skb, id 1002, offset 4000", - unit_log_get()); - kfree_skb(self->skb2); - EXPECT_EQ(1, self->napi.gro.hash[2].count); - EXPECT_EQ(6, self->napi.gro.bitmask); - - // Third packet also hits the limit for skb, causing the bucket - // to become empty. 
- self->homa.max_gro_skbs = 2; - cur_offload_core->held_skb = self->skb; - skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - unit_log_clear(); - EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( - &self->napi.gro.hash[3].list, skb))); - EXPECT_EQ(2, NAPI_GRO_CB(self->skb)->count); - EXPECT_EQ(0, self->napi.gro.hash[2].count); - EXPECT_EQ(2, self->napi.gro.bitmask); - EXPECT_STREQ("netif_receive_skb, id 1000, offset 2000", - unit_log_get()); - kfree_skb(self->skb); + NAPI_GRO_CB(skb)->same_flow = 0; + skb->hash = 0; + self->homa.gro_policy &= ~HOMA_GRO_SAME_CORE; + EXPECT_EQ(NULL, homa_gro_receive(&held_list, skb)); + EXPECT_EQ(0, skb->hash); + + /* Second call: HOMA_GRO_SAME_CORE set. */ + self->homa.gro_policy |= HOMA_GRO_SAME_CORE; + EXPECT_EQ(NULL, homa_gro_receive(&held_list, skb)); + EXPECT_EQ(rps_cpu_mask + 6, skb->hash); + kfree_skb(skb); } TEST_F(homa_offload, homa_gro_gen2) @@ -610,7 +561,6 @@ TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) EXPECT_EQ(5000, per_cpu(homa_offload_core, 3).last_active); } - TEST_F(homa_offload, homa_gro_complete__clear_held_skb) { struct homa_offload_core *offload_core = &per_cpu(homa_offload_core, diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index efe136ab..fdba35e8 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1007,7 +1007,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) #endif /* See strip.py */ unit_log_clear(); dst = crpc->peer->dst; - old_refcount = atomic_read(&dst->__rcuref.refcnt); + old_refcount = atomic_read(&dst->__refcnt); skb_get(crpc->msgout.packets); #ifndef __STRIP__ /* See strip.py */ @@ -1017,7 +1017,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) #endif /* See strip.py */ EXPECT_STREQ("xmit DATA 1000@0", unit_log_get()); EXPECT_EQ(dst, skb_dst(crpc->msgout.packets)); - EXPECT_EQ(old_refcount+1, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(old_refcount+1, atomic_read(&dst->__refcnt)); } #ifndef __STRIP__ /* 
See strip.py */ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index da09c673..073adceb 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -173,11 +173,10 @@ TEST_F(homa_peer, homa_peer_release_fn) peer = homa_peer_alloc(&self->hsk, ip3333); dst = peer->dst; dst_hold(dst); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - homa_peer_release(peer); + EXPECT_EQ(2, atomic_read(&dst->__refcnt)); homa_peer_release_fn(peer, NULL); - EXPECT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(1, atomic_read(&dst->__refcnt)); dst_release(dst); } @@ -482,7 +481,7 @@ TEST_F(homa_peer, homa_peer_alloc__success) EXPECT_EQ(0, peer->cutoff_version); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_allocs); #endif /* See strip.py */ - EXPECT_EQ(1, atomic_read(&peer->dst->__rcuref.refcnt)); + EXPECT_EQ(1, atomic_read(&peer->dst->__refcnt)); homa_peer_release(peer); } TEST_F(homa_peer, homa_peer_alloc__kmalloc_error) @@ -520,10 +519,10 @@ TEST_F(homa_peer, homa_peer_free) ASSERT_FALSE(IS_ERR(peer)); dst = peer->dst; dst_hold(dst); - ASSERT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + ASSERT_EQ(2, atomic_read(&dst->__refcnt)); homa_peer_release(peer); - ASSERT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); + ASSERT_EQ(1, atomic_read(&dst->__refcnt)); dst_release(dst); } @@ -612,7 +611,7 @@ TEST_F(homa_peer, homa_get_dst__normal) struct dst_entry *dst; dst = homa_get_dst(peer, &self->hsk); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(2, atomic_read(&dst->__refcnt)); IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->peer_dst_refreshes)); dst_release(dst); homa_peer_release(peer); @@ -626,7 +625,7 @@ TEST_F(homa_peer, homa_get_dst__must_refresh_obsolete) peer->dst->obsolete = 1; mock_dst_check_errors = 1; dst = homa_get_dst(peer, &self->hsk); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(2, atomic_read(&dst->__refcnt)); IF_NO_STRIP(EXPECT_EQ(1, 
homa_metrics_per_cpu()->peer_dst_refreshes)); EXPECT_NE(old, dst); dst_release(dst); @@ -642,7 +641,7 @@ TEST_F(homa_peer, homa_get_dst__multiple_refresh_failures) mock_dst_check_errors = 0xf; mock_route_errors = 0xf; dst = homa_get_dst(peer, &self->hsk); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(2, atomic_read(&dst->__refcnt)); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes)); EXPECT_EQ(old, dst); EXPECT_EQ(3, mock_dst_check_errors); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 8c1a292a..9189617c 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -41,7 +41,6 @@ FIXTURE(homa_plumbing) { struct msghdr sendmsg_hdr; struct homa_sendmsg_args sendmsg_args; char buffer[2000]; - sockptr_t optval; union sockaddr_in_union addr; }; FIXTURE_SETUP(homa_plumbing) @@ -98,11 +97,8 @@ FIXTURE_SETUP(homa_plumbing) 2, 200); self->sendmsg_hdr.msg_control = &self->sendmsg_args; self->sendmsg_hdr.msg_controllen = sizeof(self->sendmsg_args); - self->sendmsg_hdr.msg_control_is_user = 1; self->sendmsg_args.id = 0; self->sendmsg_args.completion_cookie = 0; - self->optval.user = (void *) 0x100000; - self->optval.is_kernel = 0; unit_log_clear(); if (self->homa.wmem_max == 0) printf("homa_plumbing fixture set wmem_max 0\n"); @@ -510,14 +506,14 @@ TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) TEST_F(homa_plumbing, homa_setsockopt__bad_level) { EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, 0, 0, - self->optval, sizeof(struct homa_rcvbuf_args))); + (void *)0x100000, sizeof(struct homa_rcvbuf_args))); EXPECT_STREQ("homa_setsockopt invoked with level not IPPROTO_HOMA", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_setsockopt__recvbuf_bad_optlen) { EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_RCVBUF, self->optval, + SO_HOMA_RCVBUF, (void *)0x100000, sizeof(struct homa_rcvbuf_args) - 1)); EXPECT_STREQ("invalid optlen argument: must be sizeof(struct 
homa_rcvbuf_args)", self->hsk.error_msg); @@ -526,7 +522,7 @@ TEST_F(homa_plumbing, homa_setsockopt__recvbuf_copy_from_sockptr_fails) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_RCVBUF, self->optval, + SO_HOMA_RCVBUF, (void *)0x100000, sizeof(struct homa_rcvbuf_args))); EXPECT_STREQ("invalid address for homa_rcvbuf_args", self->hsk.error_msg); @@ -535,10 +531,9 @@ TEST_F(homa_plumbing, homa_setsockopt__recvbuf_region_not_writable) { struct homa_rcvbuf_args args = {0x100000, 5*HOMA_BPAGE_SIZE}; - self->optval.user = &args; mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_RCVBUF, self->optval, + SO_HOMA_RCVBUF, (void *)&args, sizeof(struct homa_rcvbuf_args))); EXPECT_STREQ("receive buffer region is not writable", self->hsk.error_msg); @@ -551,11 +546,10 @@ TEST_F(homa_plumbing, homa_setsockopt__recvbuf_success) args.start = (((uintptr_t)(buffer + PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1)); args.length = 64*HOMA_BPAGE_SIZE; - self->optval.user = &args; homa_pool_free(self->hsk.buffer_pool); self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_RCVBUF, self->optval, + SO_HOMA_RCVBUF, (void *)&args, sizeof(struct homa_rcvbuf_args))); EXPECT_EQ(args.start, (uintptr_t)self->hsk.buffer_pool->region); EXPECT_EQ(64, self->hsk.buffer_pool->num_bpages); @@ -566,7 +560,7 @@ TEST_F(homa_plumbing, homa_setsockopt__recvbuf_success) TEST_F(homa_plumbing, homa_setsockopt__server_bad_optlen) { EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SERVER, self->optval, sizeof(int) - 1)); + SO_HOMA_SERVER, (void *)0x100000, sizeof(int) - 1)); EXPECT_STREQ("invalid optlen argument: must be sizeof(int)", self->hsk.error_msg); } @@ -574,7 +568,7 @@ TEST_F(homa_plumbing, homa_setsockopt__server_copy_from_sockptr_fails) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, 
-homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SERVER, self->optval, sizeof(int))); + SO_HOMA_SERVER, (void *)0x100000, sizeof(int))); EXPECT_STREQ("invalid address for SO_HOMA_SERVER value", self->hsk.error_msg); } @@ -582,20 +576,19 @@ TEST_F(homa_plumbing, homa_setsockopt__server_success) { int arg = 7; - self->optval.user = &arg; EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SERVER, self->optval, sizeof(int))); + SO_HOMA_SERVER, (void *)&arg, sizeof(int))); EXPECT_EQ(1, self->hsk.is_server); arg = 0; EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SERVER, self->optval, sizeof(int))); + SO_HOMA_SERVER, (void *)&arg, sizeof(int))); EXPECT_EQ(0, self->hsk.is_server); } TEST_F(homa_plumbing, homa_setsockopt__bad_optname) { EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0, - self->optval, sizeof(struct homa_rcvbuf_args))); + (void *)0x100000, sizeof(struct homa_rcvbuf_args))); EXPECT_STREQ("setsockopt option not supported by Homa", self->hsk.error_msg); } @@ -722,15 +715,6 @@ TEST_F(homa_plumbing, homa_sendmsg__msg_name_null) self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__msg_control_not_in_user_space) -{ - self->sendmsg_hdr.msg_control_is_user = 0; - EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, - &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); - EXPECT_STREQ("msg_control argument for sendmsg isn't in user space", - self->hsk.error_msg); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); -} TEST_F(homa_plumbing, homa_sendmsg__cant_read_msg_control) { mock_copy_data_errors = 1; @@ -955,7 +939,7 @@ TEST_F(homa_plumbing, homa_recvmsg__wrong_args_length) { self->recvmsg_hdr.msg_controllen -= 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("invalid msg_controllen in 
recvmsg", self->hsk.error_msg); } @@ -963,7 +947,7 @@ TEST_F(homa_plumbing, homa_recvmsg__cant_read_args) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("invalid address for msg_control argument to recvmsg", self->hsk.error_msg); } @@ -974,21 +958,21 @@ TEST_F(homa_plumbing, homa_recvmsg__clear_cookie) self->recvmsg_args.completion_cookie = 12345; self->recvmsg_args.num_bpages = 1000000; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.completion_cookie); } TEST_F(homa_plumbing, homa_recvmsg__num_bpages_too_large) { self->recvmsg_args.num_bpages = HOMA_MAX_BPAGES + 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("num_pages exceeds HOMA_MAX_BPAGES", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__reserved_not_zero) { self->recvmsg_args.reserved = 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("reserved fields in homa_recvmsg_args must be zero", self->hsk.error_msg); } @@ -998,7 +982,7 @@ TEST_F(homa_plumbing, homa_recvmsg__no_buffer_pool) self->hsk.buffer_pool = NULL; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("SO_HOMA_RECVBUF socket option has not been set", self->hsk.error_msg); self->hsk.buffer_pool = saved_pool; @@ -1014,7 +998,7 @@ TEST_F(homa_plumbing, homa_recvmsg__release_buffers) self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; EXPECT_EQ(EAGAIN, 
-homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, 1, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); } @@ -1025,7 +1009,7 @@ TEST_F(homa_plumbing, homa_recvmsg__error_in_release_buffers) self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, 1, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("error while releasing buffer pages", self->hsk.error_msg); } @@ -1034,7 +1018,7 @@ TEST_F(homa_plumbing, homa_recvmsg__private_rpc_doesnt_exist) self->recvmsg_args.id = 99; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("invalid RPC id passed to recvmsg", self->hsk.error_msg); } @@ -1050,7 +1034,7 @@ TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private) self->recvmsg_args.id = crpc->id; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, 1, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("error while waiting for private RPC to complete", self->hsk.error_msg); EXPECT_EQ(0, self->recvmsg_args.id); @@ -1069,7 +1053,7 @@ TEST_F(homa_plumbing, homa_recvmsg__private_rpc_has_error) self->recvmsg_args.id = crpc->id; EXPECT_EQ(ETIMEDOUT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, 1, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("RPC failed", self->hsk.error_msg); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -1077,7 +1061,7 @@ TEST_F(homa_plumbing, 
homa_recvmsg__private_rpc_has_error) TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_shared) { EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, 1, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("error while waiting for shared RPC to complete", self->hsk.error_msg); } @@ -1090,7 +1074,7 @@ TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) EXPECT_NE(NULL, crpc); EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, - &self->recvmsg_hdr, 0, MSG_DONTWAIT, + &self->recvmsg_hdr, 0, MSG_DONTWAIT, 1, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("error while waiting for shared RPC to complete", self->hsk.error_msg); @@ -1114,7 +1098,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) crpc->completion_cookie = 44444; EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(44444, self->recvmsg_args.completion_cookie); EXPECT_EQ(AF_INET, self->addr.in4.sin_family); @@ -1145,7 +1129,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv6) crpc->completion_cookie = 44444; EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(44444, self->recvmsg_args.completion_cookie); EXPECT_EQ(AF_INET6, self->addr.in6.sin6_family); @@ -1168,7 +1152,7 @@ TEST_F(homa_plumbing, homa_recvmsg__rpc_has_error) homa_rpc_abort(crpc, -ETIMEDOUT); EXPECT_EQ(ETIMEDOUT, -homa_recvmsg(&self->hsk.inet.sk, - &self->recvmsg_hdr, 0, 0, + &self->recvmsg_hdr, 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("RPC failed", self->hsk.error_msg); EXPECT_EQ(self->client_id, self->recvmsg_args.id); @@ -1192,7 +1176,7 @@ TEST_F(homa_plumbing, homa_recvmsg__add_ack) peer 
= crpc->peer; EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(1, peer->num_acks); } TEST_F(homa_plumbing, homa_recvmsg__server_normal_completion) @@ -1203,7 +1187,7 @@ TEST_F(homa_plumbing, homa_recvmsg__server_normal_completion) EXPECT_NE(NULL, srpc); EXPECT_EQ(100, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->server_id, self->recvmsg_args.id); EXPECT_EQ(RPC_IN_SERVICE, srpc->state); EXPECT_EQ(0, srpc->peer->num_acks); @@ -1219,7 +1203,7 @@ TEST_F(homa_plumbing, homa_recvmsg__delete_server_rpc_after_error) srpc->error = -ENOMEM; EXPECT_EQ(ENOMEM, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->server_id, self->recvmsg_args.id); EXPECT_EQ(RPC_DEAD, srpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -1239,7 +1223,7 @@ TEST_F(homa_plumbing, homa_recvmsg__reap_because_of_SOCK_NOSPACE) set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); EXPECT_EQ(0, self->hsk.dead_skbs); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_calls)); @@ -1255,7 +1239,7 @@ TEST_F(homa_plumbing, homa_recvmsg__error_copying_out_args) mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_STREQ("couldn't update homa_recvmsg_args argument to recvmsg: read-only?", self->hsk.error_msg); EXPECT_EQ(0, self->recvmsg_args.id); @@ -1272,7 +1256,7 @@ TEST_F(homa_plumbing, 
homa_recvmsg__copy_back_args_even_after_error) self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, 1, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.num_bpages); } @@ -1478,7 +1462,7 @@ TEST_F(homa_plumbing, homa_err_handler_v4__port_unreachable) icmp->data = skb_tail_pointer(icmp); memcpy(skb_put(icmp, failed->len), failed->head, failed->len); - EXPECT_EQ(0, homa_err_handler_v4(icmp, 111)); + homa_err_handler_v4(icmp, 111); EXPECT_EQ(ENOTCONN, -crpc->error); kfree_skb(icmp); @@ -1506,7 +1490,7 @@ TEST_F(homa_plumbing, homa_err_handler_v4__host_unreachable) icmp->data = skb_tail_pointer(icmp); memcpy(skb_put(icmp, failed->len), failed->head, failed->len); - EXPECT_EQ(0, homa_err_handler_v4(icmp, 111)); + homa_err_handler_v4(icmp, 111); EXPECT_EQ(EHOSTUNREACH, -crpc->error); kfree_skb(icmp); @@ -1530,8 +1514,8 @@ TEST_F(homa_plumbing, homa_err_handler_v6__port_unreachable) memcpy(skb_put(icmp, failed->len), skb_network_header(failed), failed->len); - EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, 0, 111)); + homa_err_handler_v6(icmp, NULL, ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, 0, 111); EXPECT_EQ(ENOTCONN, -crpc->error); kfree_skb(icmp); @@ -1554,8 +1538,8 @@ TEST_F(homa_plumbing, homa_err_handler_v6__protocol_not_supported) memcpy(skb_put(icmp, failed->len), skb_network_header(failed), failed->len); - EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_PARAMPROB, - ICMPV6_UNK_NEXTHDR, 0, 111)); + homa_err_handler_v6(icmp, NULL, ICMPV6_PARAMPROB, + ICMPV6_UNK_NEXTHDR, 0, 111); EXPECT_EQ(EPROTONOSUPPORT, -crpc->error); kfree_skb(icmp); diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index f4a9dfc2..394f0964 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -237,7 +237,7 @@ TEST_F(homa_skb, 
homa_skb_extend_frags__cant_merge_allocate_new_page) EXPECT_NE(NULL, p3); EXPECT_EQ(1000, length); EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); - EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].offset); + EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].page_offset); EXPECT_EQ(2000, self->skb->len); EXPECT_EQ(1000, skb_core->page_inuse); @@ -267,7 +267,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) EXPECT_EQ(p2 + 512, p3); EXPECT_EQ(512, length); EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); - EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].offset); + EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].page_offset); EXPECT_EQ(2048, skb_core->page_inuse); kfree_skb(skb2); @@ -397,13 +397,13 @@ TEST_F(homa_skb, homa_skb_append_to_frag__basics) EXPECT_EQ(2, shinfo->nr_frags); EXPECT_EQ(10, skb_frag_size(&shinfo->frags[0])); p = ((char *) page_address(skb_frag_page(&shinfo->frags[0]))) - + shinfo->frags[0].offset; + + shinfo->frags[0].page_offset; p[skb_frag_size(&shinfo->frags[0])] = 0; EXPECT_STREQ("abcd012345", p); EXPECT_EQ(15, skb_frag_size(&shinfo->frags[1])); p = ((char *) page_address(skb_frag_page(&shinfo->frags[1]))) - + shinfo->frags[1].offset; + + shinfo->frags[1].page_offset; EXPECT_STREQ("6789ABCDEFGHIJ", p); } TEST_F(homa_skb, homa_skb_append_to_frag__no_memory) diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index dd83df82..dc98c2ff 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -198,13 +198,16 @@ TEST_F(homa_sock, homa_sock_init__ip_header_length) TEST_F(homa_sock, homa_sock_init__hijack_tcp) { struct homa_sock hijack, no_hijack; + int protocol; self->homa.hijack_tcp = 0; mock_sock_init(&no_hijack, self->hnet, 0); self->homa.hijack_tcp = 1; mock_sock_init(&hijack, self->hnet, 0); - EXPECT_EQ(0, no_hijack.sock.sk_protocol); - EXPECT_EQ(IPPROTO_TCP, hijack.sock.sk_protocol); + protocol = no_hijack.sock.sk_protocol; + EXPECT_EQ(0, protocol); + protocol = hijack.sock.sk_protocol; + 
EXPECT_EQ(IPPROTO_TCP, protocol); unit_sock_destroy(&hijack); unit_sock_destroy(&no_hijack); } diff --git a/timetrace.c b/timetrace.c index e32b809a..702cf3e8 100644 --- a/timetrace.c +++ b/timetrace.c @@ -50,11 +50,11 @@ struct tt_buffer *tt_buffers[NR_CPUS]; /* Describes file operations implemented for reading timetraces * from /proc. */ -static const struct proc_ops tt_pops = { - .proc_open = tt_proc_open, - .proc_read = tt_proc_read, - .proc_lseek = tt_proc_lseek, - .proc_release = tt_proc_release +static const struct file_operations tt_pops = { + .open = tt_proc_open, + .read = tt_proc_read, + .llseek = tt_proc_lseek, + .release = tt_proc_release }; /* Used to remove the /proc file during tt_destroy. */ From b51e3e5dcf0d17077687e470de4daa3400417a0d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 12 Mar 2026 21:35:02 -0700 Subject: [PATCH 2/6] Remove LINUX_VERSION specs from Makefiles Otherwise won't compile on generic 4.18.0 systems. --- Makefile | 3 +-- test/Makefile | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 401ad45c..00fdb262 100644 --- a/Makefile +++ b/Makefile @@ -33,8 +33,7 @@ ifneq ($(KERNEL_SRC),) KDIR ?= $(KERNEL_SRC) endif -# LINUX_VERSION ?= $(shell uname -r) -LINUX_VERSION := 4.18.0+ +LINUX_VERSION ?= $(shell uname -r) KDIR ?= /lib/modules/$(LINUX_VERSION)/build CC = gcc-8 diff --git a/test/Makefile b/test/Makefile index 8b7e037d..c3ae5acc 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,7 +1,6 @@ # Makefile to run unit tests for Homa -LINUX_VERSION := 4.18.0+ -# LINUX_VERSION ?= $(shell uname -r) +LINUX_VERSION ?= $(shell uname -r) KDIR ?= /lib/modules/$(LINUX_VERSION)/build LINUX_SRC_DIR ?= /ouster/linux-stable CC ?= gcc From bba532e7c8c5f42f5fc4ef326a36f1a96197e2d2 Mon Sep 17 00:00:00 2001 From: AK Date: Sat, 14 Mar 2026 22:53:27 -0500 Subject: [PATCH 3/6] Fix for RHEL kernel - 4.18.0-553.30.1.el8_10.x86_6 This code hasn't actually been compiled or tested on RHEL yet (a slightly 
different version was compiled but did not work). --- Makefile | 3 +- homa_impl.h | 4 +- homa_offload.c | 163 ++++++++++++++++++++++++++++++++---------------- homa_offload.h | 4 +- homa_peer.c | 6 +- homa_plumbing.c | 16 ++++- 6 files changed, 133 insertions(+), 63 deletions(-) diff --git a/Makefile b/Makefile index 00fdb262..61d546cc 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,8 @@ endif LINUX_VERSION ?= $(shell uname -r) KDIR ?= /lib/modules/$(LINUX_VERSION)/build -CC = gcc-8 +#CC = gcc-8 +CC = gcc LINUX_SRC_DIR ?= ../net-next diff --git a/homa_impl.h b/homa_impl.h index f88f32c3..4b203408 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -715,8 +715,8 @@ int homa_copy_to_user(struct homa_rpc *rpc); void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_destroy(struct homa *homa); void homa_dispatch_pkts(struct sk_buff *skb); -void homa_err_handler_v4(struct sk_buff *skb, u32 info); -void homa_err_handler_v6(struct sk_buff *skb, +int homa_err_handler_v4(struct sk_buff *skb, u32 info); +int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); int homa_fill_data_interleaved(struct homa_rpc *rpc, diff --git a/homa_offload.c b/homa_offload.c index 04b79ae7..9e38561a 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -133,11 +133,11 @@ void homa_gro_unhook_tcp(void) * homa_tcp_gro_receive() - Invoked instead of TCP's normal gro_receive function * when hooking is enabled. Identifies Homa-over-TCP packets and passes them * to Homa; sends real TCP packets to TCP's gro_receive function. - * @gro_list: Pointer to pointer to first in list of packets that are being + * @held_list: Pointer to header for list of packets that are being * held for possible GRO merging. * @skb: The newly arrived packet. 
*/ -struct sk_buff **homa_tcp_gro_receive(struct sk_buff **gro_list, +struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb) { struct homa_common_hdr *h = (struct homa_common_hdr *) @@ -148,7 +148,7 @@ struct sk_buff **homa_tcp_gro_receive(struct sk_buff **gro_list, // ntohs(h->urgent), homa_local_id(h->sender_id)); if (h->flags != HOMA_TCP_FLAGS || ntohs(h->urgent) != HOMA_TCP_URGENT) - return tcp_net_offload->callbacks.gro_receive(gro_list, skb); + return tcp_net_offload->callbacks.gro_receive(held_list, skb); /* Change the packet's IP protocol to Homa so that it will get * dispatched directly to Homa in the future. @@ -161,7 +161,7 @@ struct sk_buff **homa_tcp_gro_receive(struct sk_buff **gro_list, htons(IPPROTO_HOMA)); ip_hdr(skb)->protocol = IPPROTO_HOMA; } - return homa_gro_receive(gro_list, skb); + return homa_gro_receive(held_list, skb); } /** @@ -267,15 +267,17 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, * unusual way: it simply aggregates all packets targeted to a particular * destination port, so that the entire bundle can get through the networking * stack in a single traversal. - * @gro_list: Pointer to pointer to first in list of packets that are being - * held for possible GRO merging. + * @held_list: Pointer to header for list of packets that are being + * held for possible GRO merging. Note: this list contains + * only packets matching a given hash. * @skb: The newly arrived packet. * - * Return: If the return value is non-NULL, it refers to a link in - * gro_list. The skb referred to by that link should be removed from the - * list by the caller and passed up the stack immediately. + * Return: If the return value is non-NULL, it refers to an skb in + * gro_list. The skb will be removed from the list by the caller and + * passed up the stack immediately. 
*/ -struct sk_buff **homa_gro_receive(struct sk_buff **gro_list, struct sk_buff *skb) +struct sk_buff *homa_gro_receive(struct list_head *held_list, + struct sk_buff *skb) { /* This function will do one of the following things: * 1. Merge skb with a packet in gro_list by appending it to @@ -290,14 +292,14 @@ struct sk_buff **homa_gro_receive(struct sk_buff **gro_list, struct sk_buff *skb struct homa *homa = homa_net(dev_net(skb->dev))->homa; u64 saved_softirq_metric, softirq_cycles; struct homa_offload_core *offload_core; - struct sk_buff **result = NULL; + struct sk_buff *result = NULL; struct homa_data_hdr *h_new; u64 *softirq_cycles_metric; struct sk_buff *held_skb; u64 now = homa_clock(); - struct sk_buff **pp; int priority; u32 saddr; + u32 hash; int busy; if (!homa_make_header_avl(skb)) @@ -359,54 +361,109 @@ struct sk_buff **homa_gro_receive(struct sk_buff **gro_list, struct sk_buff *skb #endif /* See strip.py */ } - h_new->common.gro_count = 1; - for (pp = gro_list; (held_skb = *pp) != NULL; pp = &held_skb->next) { - struct homa_common_hdr *h_held; - int protocol; - - h_held = (struct homa_common_hdr *)skb_transport_header( - held_skb); - - /* Packets can be batched together as long as they are all - * Homa packets, even if they are from different RPCs. Don't - * use the same_flow mechanism that is normally used in - * gro_receive, because it won't allow packets from different - * sources to be aggregated. + /* The GRO mechanism tries to separate packets onto different + * gro_lists by hash. This is bad for us, because we want to batch + * packets together regardless of their RPCs. So, instead of + * checking the list they gave us, check the last list where this + * core added a Homa packet (if there is such a list). + */ + hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + if (offload_core->held_skb) { + /* Reverse-engineer the location of the gro_node, so we + * can verify that held_skb is still valid. 
*/ - if (skb_is_ipv6(held_skb)) - protocol = ipv6_hdr(held_skb)->nexthdr; - else - protocol = ip_hdr(held_skb)->protocol; - if (protocol != IPPROTO_HOMA) - continue; + struct gro_list *gro_list = container_of(held_list, + struct gro_list, list); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) + struct napi_struct *napi = container_of(gro_list, + struct napi_struct, gro_hash[hash]); +#else + struct gro_node *gro_node = container_of(gro_list, + struct gro_node, hash[hash]); +#endif - /* Aggregate skb into held_skb. We don't update the length of - * held_skb, because we'll eventually split it up and process - * each skb independently. + /* Must verify that offload_core->held_skb points to a packet on + * the list, and that the packet is a Homa packet. + * homa_gro_complete isn't always invoked before removing + * packets from the list, so offload_core->held_skb could be a + * dangling pointer (or the skb could have been reused for + * some other protocol). */ - if (NAPI_GRO_CB(held_skb)->last == held_skb) - skb_shinfo(held_skb)->frag_list = skb; - else - NAPI_GRO_CB(held_skb)->last->next = skb; - NAPI_GRO_CB(held_skb)->last = skb; - skb->next = NULL; - NAPI_GRO_CB(skb)->same_flow = 1; - NAPI_GRO_CB(held_skb)->count++; - h_held->gro_count++; - if (h_held->gro_count >= homa->max_gro_skbs) - result = pp; - goto done; + list_for_each_entry(held_skb, +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) + &napi->gro_hash[offload_core->held_bucket].list, +#else + &gro_node->hash[offload_core->held_bucket].list, +#endif + list) { + int protocol; + + if (held_skb != offload_core->held_skb) + continue; + if (skb_is_ipv6(held_skb)) + protocol = ipv6_hdr(held_skb)->nexthdr; + else + protocol = ip_hdr(held_skb)->protocol; + if (protocol != IPPROTO_HOMA) { + tt_record3("homa_gro_receive held_skb 0x%0x%0x isn't Homa: protocol %d", + tt_hi(held_skb), tt_lo(held_skb), + protocol); + continue; + } + + /* Aggregate skb into held_skb. 
We don't update the + * length of held_skb because we'll eventually split + * it up and process each skb independently. + */ + if (NAPI_GRO_CB(held_skb)->last == held_skb) + skb_shinfo(held_skb)->frag_list = skb; + else + NAPI_GRO_CB(held_skb)->last->next = skb; + NAPI_GRO_CB(held_skb)->last = skb; + skb->next = NULL; + NAPI_GRO_CB(skb)->same_flow = 1; + NAPI_GRO_CB(held_skb)->count++; + if (NAPI_GRO_CB(held_skb)->count >= homa->max_gro_skbs) { + /* Push this batch up through the SoftIRQ + * layer. This code is a hack, needed because + * returning skb as result is no longer + * sufficient (as of 5.4.80) to push it up + * the stack; the packet just gets queued on + * gro_node->rx_list. This code basically steals + * the packet from dev_gro_receive and + * pushes it upward. + */ + skb_list_del_init(held_skb); + homa_gro_complete(held_skb, 0); + netif_receive_skb(held_skb); + homa_send_ipis(); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) + napi->gro_hash[offload_core->held_bucket].count--; + if (napi->gro_hash[offload_core->held_bucket].count == 0) + __clear_bit(offload_core->held_bucket, + &napi->gro_bitmask); +#else + gro_node->hash[offload_core->held_bucket].count--; + if (gro_node->hash[offload_core->held_bucket].count == 0) + __clear_bit(offload_core->held_bucket, + &gro_node->bitmask); +#endif + result = ERR_PTR(-EINPROGRESS); + } + goto done; + } } /* There was no existing Homa packet that this packet could be - * batched with, so this packet will now go on gro_list for future - * packets to be batched with. If the packet is sent up the stack - * before another packet arrives for batching, we want it to be - * processed on this same core (it's faster that way, and if - * batching doesn't occur it means we aren't heavily loaded; if - * batching does occur, homa_gro_complete will pick a different - * core). + * batched with, so this packet will become the new merge_skb. 
+ * If the packet is sent up the stack before another packet + * arrives for batching, we want it to be processed on this same + * core (it's faster that way, and if batching doesn't occur it + * means we aren't heavily loaded; if batching does occur, + * homa_gro_complete will pick a different core). */ + offload_core->held_skb = skb; + offload_core->held_bucket = hash; if (likely(homa->gro_policy & HOMA_GRO_SAME_CORE)) homa_set_softirq_cpu(skb, smp_processor_id()); diff --git a/homa_offload.h b/homa_offload.h index 722c65c3..00983d98 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -79,7 +79,7 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb); void homa_gro_hook_tcp(void); void homa_gro_unhook_tcp(void); #endif /* See strip.py */ -struct sk_buff **homa_gro_receive(struct sk_buff **gro_list, +struct sk_buff *homa_gro_receive(struct list_head *gro_list, struct sk_buff *skb); struct sk_buff *homa_gso_segment(struct sk_buff *skb, netdev_features_t features); @@ -88,7 +88,7 @@ int homa_offload_init(void); void homa_send_ipis(void); void homa_set_softirq_cpu(struct sk_buff *skb, int cpu); #ifndef __STRIP__ /* See strip.py */ -struct sk_buff **homa_tcp_gro_receive(struct sk_buff **gro_list, +struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb); #endif /* See strip.py */ diff --git a/homa_peer.c b/homa_peer.c index a5d4c24e..3d4956bb 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -554,8 +554,10 @@ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) peer->flow.u.ip6.fl6_sport = 0; peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; security_sk_classify_flow(&hsk->sock, &peer->flow); - dst = ip6_dst_lookup_flow(&hsk->sock, &peer->flow.u.ip6, - &peer->addr); + dst = ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock, + &peer->flow.u.ip6, + &peer->addr); + if (IS_ERR(dst)) { result = PTR_ERR(dst); INC_METRIC(peer_route_errors, 1); diff --git a/homa_plumbing.c b/homa_plumbing.c index 6c4b9e39..406da4a2 100644 
--- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -697,11 +697,13 @@ void __exit homa_unload(void) #ifndef __STRIP__ /* See strip.py */ homa_gro_unhook_tcp(); +#endif /* See strip.py */ if (timer_kthread) { timer_thread_exit = 1; wake_up_process(timer_kthread); wait_for_completion(&timer_thread_done); } +#ifndef __STRIP__ /* See strip.py */ homa_qdisc_unregister(); if (homa_offload_end() != 0) pr_err("Homa couldn't stop offloads\n"); @@ -1680,8 +1682,10 @@ int homa_softirq(struct sk_buff *skb) * the ICMP header (the first byte of the embedded packet IP header). * @skb: The incoming packet. * @info: Information about the error that occurred? + * + * Return: zero, or a negative errno if the error couldn't be handled here. */ -void homa_err_handler_v4(struct sk_buff *skb, u32 info) +int homa_err_handler_v4(struct sk_buff *skb, u32 info) { struct homa *homa = homa_net(dev_net(skb->dev))->homa; const struct icmphdr *icmp = icmp_hdr(skb); @@ -1711,6 +1715,8 @@ void homa_err_handler_v4(struct sk_buff *skb, u32 info) } if (error != 0) homa_abort_rpcs(homa, &daddr, port, error); + + return 0; } /** @@ -1723,8 +1729,10 @@ void homa_err_handler_v4(struct sk_buff *skb, u32 info) * @code: Additional information about the error. * @offset: Not used. * @info: Information about the error that occurred? + * + * Return: zero, or a negative errno if the error couldn't be handled here. 
*/ -void homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, +int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; @@ -1745,6 +1753,8 @@ void homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (error != 0) homa_abort_rpcs(homa, &iph->daddr, port, error); + + return 0; } /** @@ -1765,7 +1775,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, __poll_t mask; mask = 0; - sock_poll_wait(file, sk_sleep(sock->sk), wait); + sock_poll_wait(file, sock, wait); tt_record2("homa_poll found sk_wmem_alloc %d, sk_sndbuf %d", refcount_read(&hsk->sock.sk_wmem_alloc), hsk->sock.sk_sndbuf); From bd3ca9f697917b6cf00bb073c83cfddf38d04286 Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 25 Mar 2026 15:14:04 -0700 Subject: [PATCH 4/6] Create new branch rhel_9.5 as a derivative of rhel8 --- homa_impl.h | 2 +- homa_metrics.c | 10 +++++----- homa_offload.c | 5 ++++- homa_outgoing.c | 9 +++++---- homa_peer.c | 6 ++++-- homa_plumbing.c | 9 ++++----- homa_skb.c | 8 ++++---- timetrace.c | 10 +++++----- 8 files changed, 32 insertions(+), 27 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index b5459836..0a4575ee 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -748,7 +748,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc); int homa_rpc_tx_end(struct homa_rpc *rpc); int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int homa_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int homa_shutdown(struct socket *sock, int how); int homa_socket(struct sock *sk); int homa_softirq(struct sk_buff *skb); diff --git a/homa_metrics.c b/homa_metrics.c index fd0e82f4..676ba827 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -9,11 +9,11 @@ DEFINE_PER_CPU(struct homa_metrics, homa_metrics); /* Describes file operations implemented for 
/proc/net/homa_metrics. */ -static const struct file_operations homa_metrics_ops = { - .open = homa_metrics_open, - .read = homa_metrics_read, - .llseek = homa_metrics_lseek, - .release = homa_metrics_release, +static const struct proc_ops homa_metrics_ops = { + .proc_open = homa_metrics_open, + .proc_read = homa_metrics_read, + .proc_lseek = homa_metrics_lseek, + .proc_release = homa_metrics_release, }; /* Global information used to export metrics information through a file in diff --git a/homa_offload.c b/homa_offload.c index 9e38561a..edd542b6 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -9,6 +9,8 @@ #include "homa_pacer.h" #include "homa_qdisc.h" #include "homa_wire.h" +#include +#include DEFINE_PER_CPU(struct homa_offload_core, homa_offload_core); @@ -143,7 +145,8 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, struct homa_common_hdr *h = (struct homa_common_hdr *) skb_transport_header(skb); - // tt_record4("homa_tcp_gro_receive got type 0x%x, flags 0x%x, " + //tt_record2("Source address is 0x%x length %d", ntohl(ip_hdr(skb)->saddr), skb->len); + //tt_record4("homa_tcp_gro_receive got type 0x%x, flags 0x%x, " // "urgent 0x%x, id %d", h->type, h->flags, // ntohs(h->urgent), homa_local_id(h->sender_id)); if (h->flags != HOMA_TCP_FLAGS || diff --git a/homa_outgoing.c b/homa_outgoing.c index de2c5edc..eed4f44c 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -511,7 +511,8 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, if (hsk->inet.sk.sk_family == AF_INET6) { homa_set_hijack(skb, peer, true); result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, - NULL, hsk->homa->priority_map[priority] << 5); + NULL, hsk->homa->priority_map[priority] << 5, + 0); } else { homa_set_hijack(skb, peer, false); @@ -534,7 +535,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #else /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) result = ip6_xmit(&hsk->inet.sk, skb, 
&peer->flow.u.ip6, 0, - NULL, 0); + NULL, 0, 0); else result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); #endif /* See strip.py */ @@ -723,10 +724,10 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) homa_set_hijack(skb, rpc->peer, true); err = ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, 0, NULL, - rpc->hsk->homa->priority_map[priority] << 5); + rpc->hsk->homa->priority_map[priority] << 5, 0); #else /* See strip.py */ ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, - 0, NULL, 0); + 0, NULL, 0, 0); #endif /* See strip.py */ } else { tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", diff --git a/homa_peer.c b/homa_peer.c index 7831874b..b4ca75c9 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -519,7 +519,8 @@ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) ipv6_to_ipv4(peer->addr), hsk->inet.inet_saddr, 0, 0, hsk->sock.sk_uid); - security_sk_classify_flow(&hsk->sock, &peer->flow); + security_sk_classify_flow(&hsk->sock, + &peer->flow.u.__fl_common); rt = ip_route_output_flow(sock_net(&hsk->sock), &peer->flow.u.ip4, &hsk->sock); if (IS_ERR(rt)) { @@ -541,7 +542,8 @@ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) peer->flow.u.ip6.fl6_dport = 0; peer->flow.u.ip6.fl6_sport = 0; peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; - security_sk_classify_flow(&hsk->sock, &peer->flow); + security_sk_classify_flow(&hsk->sock, + &peer->flow.u.__fl_common); dst = ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock, &peer->flow.u.ip6, &peer->addr); diff --git a/homa_plumbing.c b/homa_plumbing.c index a2653a23..f3ffa5ae 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -144,7 +144,6 @@ static struct net_protocol homa_protocol = { .handler = homa_softirq, .err_handler = homa_err_handler_v4, .no_policy = 1, - .netns_ok = 1, }; static struct inet6_protocol homav6_protocol = { @@ -992,7 +991,7 @@ int homa_socket(struct sock *sk) * on errors. 
*/ int homa_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct homa_sock *hsk = homa_sk(sk); int ret; @@ -1013,7 +1012,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } - if (unlikely(copy_from_user(&args, optval, optlen))) { + if (unlikely(copy_from_sockptr(&args, optval, optlen))) { hsk->error_msg = "invalid address for homa_rcvbuf_args"; return -EFAULT; } @@ -1039,7 +1038,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } - if (unlikely(copy_from_user(&arg, optval, optlen))) { + if (unlikely(copy_from_sockptr(&arg, optval, optlen))) { hsk->error_msg = "invalid address for SO_HOMA_SERVER value"; return -EFAULT; } @@ -1998,7 +1997,7 @@ int homa_timer_main(void *transport) homa_timer(homa); } hrtimer_cancel(&hrtimer); - complete_and_exit(&timer_thread_done, 0); + kthread_complete_and_exit(&timer_thread_done, 0); return 0; } diff --git a/homa_skb.c b/homa_skb.c index 43d00c23..b9279430 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -204,7 +204,7 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) frag = &shinfo->frags[shinfo->nr_frags - 1]; if (skb_frag_page(frag) == skb_core->skb_page && skb_core->page_inuse < skb_core->page_size && - (frag->page_offset + skb_frag_size(frag)) == + (frag->bv_offset + skb_frag_size(frag)) == skb_core->page_inuse) { if ((skb_core->page_size - skb_core->page_inuse) < actual_size) @@ -234,7 +234,7 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) shinfo->nr_frags++; frag_page_set(frag, skb_core->skb_page); get_page(skb_core->skb_page); - frag->page_offset = skb_core->page_inuse; + frag->bv_offset = skb_core->page_inuse; *length = actual_size; skb_frag_size_set(frag, actual_size); result = page_address(skb_frag_page(frag)) + skb_core->page_inuse; @@ -432,7 +432,7 @@ int homa_skb_append_from_skb(struct homa *homa, 
struct sk_buff *dst_skb, dst_shinfo->nr_frags++; frag_page_set(dst_frag, skb_frag_page(src_frag)); get_page(skb_frag_page(src_frag)); - dst_frag->page_offset = src_frag->page_offset + dst_frag->bv_offset = src_frag->bv_offset + (offset - src_frag_offset); skb_frag_size_set(dst_frag, chunk_size); offset += chunk_size; @@ -582,7 +582,7 @@ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) if (chunk_size > length) chunk_size = length; memcpy(dst, page_address(skb_frag_page(frag)) + - frag->page_offset + (offset - frag_offset), + frag->bv_offset + (offset - frag_offset), chunk_size); offset += chunk_size; length -= chunk_size; diff --git a/timetrace.c b/timetrace.c index 702cf3e8..6c926c5d 100644 --- a/timetrace.c +++ b/timetrace.c @@ -50,11 +50,11 @@ struct tt_buffer *tt_buffers[NR_CPUS]; /* Describes file operations implemented for reading timetraces * from /proc. */ -static const struct file_operations tt_pops = { - .open = tt_proc_open, - .read = tt_proc_read, - .llseek = tt_proc_lseek, - .release = tt_proc_release +static const struct proc_ops tt_pops = { + .proc_open = tt_proc_open, + .proc_read = tt_proc_read, + .proc_lseek = tt_proc_lseek, + .proc_release = tt_proc_release }; /* Used to remove the /proc file during tt_destroy. */ From 690e7d57c334bd9e74172f9270d8a737dd88291d Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 22 Apr 2026 22:30:19 -0500 Subject: [PATCH 5/6] Added Support for Hijack UDP --- UDP_HIJACK.md | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 UDP_HIJACK.md diff --git a/UDP_HIJACK.md b/UDP_HIJACK.md new file mode 100644 index 00000000..6ec23453 --- /dev/null +++ b/UDP_HIJACK.md @@ -0,0 +1,109 @@ +# UDP Hijacking for Homa + +## Overview + +UDP hijacking is an optional mechanism that encapsulates Homa packets as UDP +datagrams, using `IPPROTO_UDP` instead of `IPPROTO_HOMA` as the IP protocol. 
+It works alongside the existing TCP hijacking feature — only one can be active +at a time on a given socket. + +### Why UDP hijacking? + +TCP hijacking uses `SYN+RST` flag combinations that never occur in real TCP +traffic. However, some firewalls (particularly on virtualized environments) +inspect TCP flags and drop packets with these "impossible" flag combinations. +UDP hijacking avoids this issue entirely since UDP has no flags for firewalls +to inspect. + +### Trade-offs vs TCP hijacking + +| Feature | TCP hijacking | UDP hijacking | +|---------------------|------------------------|------------------------| +| NIC TSO support | Yes (multi-segment) | No (single-segment) | +| Firewall friendly | No (SYN+RST blocked) | Yes | +| GSO segments/packet | Multiple | 1 (`segs_per_gso = 1`) | +| IP protocol | `IPPROTO_TCP` | `IPPROTO_UDP` | +| sysctl | `hijack_tcp` | `hijack_udp` | + +Because NICs do not perform TSO on UDP packets the same way they do for TCP, +UDP hijacking forces `segs_per_gso = 1` (one segment per GSO packet). This +means each Homa data packet is sent individually rather than being batched +into large TSO super-packets. + +## Configuration + +Enable UDP hijacking at runtime via sysctl: + +```bash +# Enable UDP hijacking (disable TCP hijacking first if it was on) +sudo sysctl net.homa.hijack_tcp=0 +sudo sysctl net.homa.hijack_udp=1 +``` + +To switch back to TCP hijacking: + +```bash +sudo sysctl net.homa.hijack_udp=0 +sudo sysctl net.homa.hijack_tcp=1 +``` + +**Note:** If both `hijack_tcp` and `hijack_udp` are set, TCP hijacking takes +priority (sockets opened while both are set will use TCP). + +## How It Works + +### Sending (outgoing packets) + +1. **Socket initialization** (`homa_hijack_sock_init`): When a new Homa socket + is created, if `hijack_udp` is set the socket's `sk_protocol` is set to + `IPPROTO_UDP`. The kernel then transmits packets with a UDP IP protocol. + +2. 
**Header setup** (`homa_udp_hijack_set_hdr`): Before transmission, Homa + writes UDP-compatible header fields: + - `flags` is set to `HOMA_HIJACK_FLAGS` (6) — a marker value. + - `urgent` is set to `HOMA_HIJACK_URGENT` (0xb97d) — a second marker. + - Bytes 4-5 of the transport header are overwritten with the UDP length. + - Bytes 6-7 are set up for proper UDP checksum offload. + - Because the sequence field (bytes 4-7) is overwritten, the packet offset + is stored in `seg.offset` instead. + +3. **GSO geometry**: With UDP hijacking, `segs_per_gso` is forced to 1 (no + multi-segment GSO batching). + +### Receiving (incoming packets) + +1. **GRO interception** (`homa_udp_hijack_gro_receive`): Homa hooks into the + UDP GRO pipeline. When a UDP packet arrives, Homa checks: + - At least 20 bytes of transport header are available. + - `flags == HOMA_HIJACK_FLAGS` and `urgent == HOMA_HIJACK_URGENT`. + +2. If the packet is identified as a Homa-over-UDP packet, the IP protocol + is rewritten to `IPPROTO_HOMA` and the packet is handed to Homa's normal + GRO handler. Real UDP packets are passed through to the normal UDP stack. + +### Qdisc support + +The `is_homa_pkt()` function in `homa_qdisc.c` recognizes both TCP-hijacked +and UDP-hijacked packets, ensuring they receive proper Homa qdisc treatment. 
+ +## Files Modified + +| File | Changes | +|-------------------|------------------------------------------------------------| +| `homa_wire.h` | No new defines needed (reuses `HOMA_HIJACK_FLAGS` and `HOMA_HIJACK_URGENT`) | +| `homa_impl.h` | Added `hijack_udp` field to `struct homa` | +| `homa_hijack.h` | Added `homa_udp_hijack_set_hdr()`, `homa_sock_udp_hijacked()`, `homa_skb_udp_hijacked()`; updated `homa_hijack_sock_init()` | +| `homa_hijack.c` | Added `homa_udp_hijack_init()`, `homa_udp_hijack_end()`, `homa_udp_hijack_gro_receive()` | +| `homa_outgoing.c` | Added `segs_per_gso=1` for UDP; added UDP header calls in xmit paths | +| `homa_plumbing.c` | Added `hijack_udp` sysctl; added UDP init/end calls | +| `homa_qdisc.c` | Added `IPPROTO_UDP` check in `is_homa_pkt()` | +| `util/homa_test.cc` | Added `udp_ping()`, `test_udp()`, "udp" test command | +| `util/server.cc` | Added `udp_server()` function | +| `util/cp_node.cc` | Added `udp_server` and `udp_client` classes, "udp" protocol option | + +## Key Constants + +| Constant | Value | Purpose | +|----------------------|----------|------------------------------------------------------| +| `HOMA_HIJACK_FLAGS` | 6 | Marker in the `flags` field (shared with TCP hijack) | +| `HOMA_HIJACK_URGENT` | 0xb97d | Marker in the `urgent` field (shared with TCP hijack)| From 0f1becc1c2aa565c0d1d90fff07f2df893d6fb80 Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 22 Apr 2026 22:31:11 -0500 Subject: [PATCH 6/6] modified: homa_impl.h modified: homa_offload.c modified: homa_offload.h modified: homa_outgoing.c modified: homa_plumbing.c modified: homa_qdisc.c modified: homa_sock.c modified: homa_wire.h modified: util/cp_node.cc modified: util/homa_test.cc modified: util/server.cc --- homa_impl.h | 11 +- homa_offload.c | 89 ++++++++++++ homa_offload.h | 4 + homa_outgoing.c | 73 +++++++++- homa_plumbing.c | 9 ++ homa_qdisc.c | 19 ++- homa_sock.c | 2 + homa_wire.h | 18 ++- util/cp_node.cc | 358 
+++++++++++++++++++++++++++++++++++++++++++++- util/homa_test.cc | 93 ++++++++++++ util/server.cc | 69 +++++++++ 11 files changed, 722 insertions(+), 23 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 0a4575ee..84a05a6c 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -358,6 +358,13 @@ struct homa { */ int hijack_tcp; + /** + * @hijack_udp: Non-zero means encapsulate outgoing Homa packets + * as UDP packets (i.e. use UDP as the IP protocol). Set externally + * via sysctl. + */ + int hijack_udp; + /** * @max_gro_skbs: Maximum number of socket buffers that can be * aggregated by the GRO mechanism. Set externally via sysctl. @@ -665,7 +672,9 @@ static inline bool is_homa_pkt(struct sk_buff *skb) ip_hdr(skb)->protocol; return (protocol == IPPROTO_HOMA || (protocol == IPPROTO_TCP && - tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT))); + tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)) || + (protocol == IPPROTO_UDP && + tcp_hdr(skb)->urg_ptr == htons(HOMA_UDP_URGENT))); return protocol == IPPROTO_HOMA; } #endif /* See strip.py */ diff --git a/homa_offload.c b/homa_offload.c index edd542b6..75863a50 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -37,6 +37,19 @@ static const struct net_offload *tcp6_net_offload; */ static struct net_offload hook_tcp_net_offload; static struct net_offload hook_tcp6_net_offload; + +/* Pointers to UDP's net_offload structures. NULL means homa_gro_hook_udp + * hasn't been called yet. + */ +static const struct net_offload *udp_net_offload; +static const struct net_offload *udp6_net_offload; + +/* + * Identical to *udp_net_offload except that the gro_receive function + * has been replaced. 
+ */ +static struct net_offload hook_udp_net_offload; +static struct net_offload hook_udp6_net_offload; #endif /* See strip.py */ /** @@ -167,6 +180,82 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, return homa_gro_receive(held_list, skb); } +/** + * homa_gro_hook_udp() - Arranges for UDP gro_receive calls to be + * mediated by this file, so that Homa-over-UDP packets can be retrieved + * and funneled through Homa. + */ +void homa_gro_hook_udp(void) +{ + if (udp_net_offload) + return; + + pr_notice("Homa setting up UDP hijacking\n"); + rcu_read_lock(); + udp_net_offload = rcu_dereference(inet_offloads[IPPROTO_UDP]); + hook_udp_net_offload = *udp_net_offload; + hook_udp_net_offload.callbacks.gro_receive = homa_udp_gro_receive; + inet_offloads[IPPROTO_UDP] = (struct net_offload __rcu *) + &hook_udp_net_offload; + + udp6_net_offload = rcu_dereference(inet6_offloads[IPPROTO_UDP]); + hook_udp6_net_offload = *udp6_net_offload; + hook_udp6_net_offload.callbacks.gro_receive = homa_udp_gro_receive; + inet6_offloads[IPPROTO_UDP] = (struct net_offload __rcu *) + &hook_udp6_net_offload; + rcu_read_unlock(); +} + +/** + * homa_gro_unhook_udp() - Reverses the effects of a previous call to + * homa_gro_hook_udp, so that UDP packets are now passed directly to + * UDP's gro_receive function without mediation. + */ +void homa_gro_unhook_udp(void) +{ + if (!udp_net_offload) + return; + pr_notice("Homa cancelling UDP hijacking\n"); + inet_offloads[IPPROTO_UDP] = (struct net_offload __rcu *) + udp_net_offload; + udp_net_offload = NULL; + inet6_offloads[IPPROTO_UDP] = (struct net_offload __rcu *) + udp6_net_offload; + udp6_net_offload = NULL; +} + +/** + * homa_udp_gro_receive() - Invoked instead of UDP's normal gro_receive + * function when hooking is enabled. Identifies Homa-over-UDP packets and + * passes them to Homa; sends real UDP packets to UDP's gro_receive function. 
+ * @held_list: Pointer to header for list of packets that are being + * held for possible GRO merging. + * @skb: The newly arrived packet. + */ +struct sk_buff *homa_udp_gro_receive(struct list_head *held_list, + struct sk_buff *skb) +{ + struct homa_common_hdr *h = (struct homa_common_hdr *) + skb_transport_header(skb); + + if (h->flags != HOMA_UDP_FLAGS || + ntohs(h->urgent) != HOMA_UDP_URGENT) + return udp_net_offload->callbacks.gro_receive(held_list, skb); + + /* Change the packet's IP protocol to Homa so that it will get + * dispatched directly to Homa in the future. + */ + if (skb_is_ipv6(skb)) { + ipv6_hdr(skb)->nexthdr = IPPROTO_HOMA; + } else { + ip_hdr(skb)->check = ~csum16_add(csum16_sub(~ip_hdr(skb)->check, + htons(ip_hdr(skb)->protocol)), + htons(IPPROTO_HOMA)); + ip_hdr(skb)->protocol = IPPROTO_HOMA; + } + return homa_gro_receive(held_list, skb); +} + /** * homa_set_softirq_cpu() - Arrange for SoftIRQ processing of a packet to * occur on a specific core (creates a socket flow table entry for the core, diff --git a/homa_offload.h b/homa_offload.h index 00983d98..ee47b1ab 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -78,6 +78,8 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb); #ifndef __STRIP__ /* See strip.py */ void homa_gro_hook_tcp(void); void homa_gro_unhook_tcp(void); +void homa_gro_hook_udp(void); +void homa_gro_unhook_udp(void); #endif /* See strip.py */ struct sk_buff *homa_gro_receive(struct list_head *gro_list, struct sk_buff *skb); @@ -90,6 +92,8 @@ void homa_set_softirq_cpu(struct sk_buff *skb, int cpu); #ifndef __STRIP__ /* See strip.py */ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb); +struct sk_buff *homa_udp_gro_receive(struct list_head *held_list, + struct sk_buff *skb); #endif /* See strip.py */ #endif /* _HOMA_OFFLOAD_H */ diff --git a/homa_outgoing.c b/homa_outgoing.c index eed4f44c..b280eafb 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -10,6 +10,7 @@ #include 
"homa_wire.h" #ifndef __STRIP__ /* See strip.py */ +#include #include "homa_pacer.h" #include "homa_qdisc.h" #include "homa_skb.h" @@ -49,6 +50,45 @@ static inline void homa_set_hijack(struct sk_buff *skb, struct homa_peer *peer, h->checksum = ~tcp_v4_check(skb->len, peer->flow.u.ip4.saddr, peer->flow.u.ip4.daddr, 0); } + +/** + * homa_set_udp_hijack() - Set fields in an outgoing Homa packet that are + * needed for UDP hijacking to work properly. Similar to homa_set_hijack() + * but uses IPPROTO_UDP for checksumming and writes a UDP length/checksum + * overlay at bytes 4-7 of the transport header. + * @skb: Packet buffer in which to set fields. + * @peer: Peer that contains source and destination addresses for the packet. + * @ipv6: True means the packet is going to be sent via IPv6; false means + * IPv4. + */ +static inline void homa_set_udp_hijack(struct sk_buff *skb, + struct homa_peer *peer, bool ipv6) +{ + struct homa_common_hdr *h; + __be16 *udp_len_csum; + + h = (struct homa_common_hdr *)skb_transport_header(skb); + h->flags = HOMA_UDP_FLAGS; + h->urgent = htons(HOMA_UDP_URGENT); + + /* Write UDP length and checksum at bytes 4-7 of transport header. */ + udp_len_csum = (__be16 *)(skb_transport_header(skb) + 4); + udp_len_csum[0] = htons(skb->len); /* UDP length */ + udp_len_csum[1] = 0; /* UDP checksum (0 = none) */ + + /* Arrange for proper UDP checksumming. 
*/ + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct homa_common_hdr, checksum); + if (ipv6) + h->checksum = ~csum_ipv6_magic(&peer->flow.u.ip6.saddr, + &peer->flow.u.ip6.daddr, + skb->len, IPPROTO_UDP, 0); + else + h->checksum = ~udp_v4_check(skb->len, + peer->flow.u.ip4.saddr, + peer->flow.u.ip4.daddr, 0); +} #endif /* See strip.py */ /** @@ -204,7 +244,10 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, IF_NO_STRIP(h->cutoff_version = rpc->peer->cutoff_version); h->retransmit = 0; #ifndef __STRIP__ /* See strip.py */ - h->seg.offset = htonl(-1); + if (hsk->sock.sk_protocol == IPPROTO_UDP) + h->seg.offset = htonl(offset); + else + h->seg.offset = htonl(-1); #else /* See strip.py */ h->seg.offset = htonl(offset); #endif /* See strip.py */ @@ -219,7 +262,8 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, homa_info->rpc = rpc; #ifndef __STRIP__ /* See strip.py */ - if (segs > 1 && hsk->sock.sk_protocol != IPPROTO_TCP) { + if (segs > 1 && hsk->sock.sk_protocol != IPPROTO_TCP + && hsk->sock.sk_protocol != IPPROTO_UDP) { #else /* See strip.py */ if (segs > 1) { #endif /* See strip.py */ @@ -322,10 +366,13 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) * if no hijacking). 
*/ if (rpc->hsk->sock.sk_protocol == IPPROTO_TCP) { - /* Hijacking */ + /* TCP Hijacking */ segs_per_gso = gso_size - rpc->hsk->ip_header_length - sizeof(struct homa_data_hdr); do_div(segs_per_gso, max_seg_data); + } else if (rpc->hsk->sock.sk_protocol == IPPROTO_UDP) { + /* UDP Hijacking: one segment per GSO */ + segs_per_gso = 1; } else { /* No hijacking */ segs_per_gso = gso_size - rpc->hsk->ip_header_length - @@ -509,12 +556,18 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, homa_set_doff(skb, length); #ifndef __STRIP__ /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { - homa_set_hijack(skb, peer, true); + if (hsk->sock.sk_protocol == IPPROTO_UDP) + homa_set_udp_hijack(skb, peer, true); + else + homa_set_hijack(skb, peer, true); result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, NULL, hsk->homa->priority_map[priority] << 5, 0); } else { - homa_set_hijack(skb, peer, false); + if (hsk->sock.sk_protocol == IPPROTO_UDP) + homa_set_udp_hijack(skb, peer, false); + else + homa_set_hijack(skb, peer, false); /* This will find its way to the DSCP field in the IPv4 hdr. 
*/ hsk->inet.tos = hsk->homa->priority_map[priority] << 5; @@ -721,7 +774,10 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) tt_addr(rpc->peer->addr), rpc->id, homa_get_skb_info(skb)->offset); #ifndef __STRIP__ /* See strip.py */ - homa_set_hijack(skb, rpc->peer, true); + if (rpc->hsk->sock.sk_protocol == IPPROTO_UDP) + homa_set_udp_hijack(skb, rpc->peer, true); + else + homa_set_hijack(skb, rpc->peer, true); err = ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, 0, NULL, rpc->hsk->homa->priority_map[priority] << 5, 0); @@ -736,7 +792,10 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) homa_get_skb_info(skb)->offset); #ifndef __STRIP__ /* See strip.py */ - homa_set_hijack(skb, rpc->peer, false); + if (rpc->hsk->sock.sk_protocol == IPPROTO_UDP) + homa_set_udp_hijack(skb, rpc->peer, false); + else + homa_set_hijack(skb, rpc->peer, false); rpc->hsk->inet.tos = rpc->hsk->homa->priority_map[priority] << 5; err = ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); diff --git a/homa_plumbing.c b/homa_plumbing.c index f3ffa5ae..142bad99 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -257,6 +257,13 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, + { + .procname = "hijack_udp", + .data = OFFSET(hijack_udp), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_dointvec + }, { .procname = "link_mbps", .data = OFFSET(link_mbps), @@ -642,6 +649,7 @@ int __init homa_load(void) #ifndef __STRIP__ /* See strip.py */ homa_gro_hook_tcp(); + homa_gro_hook_udp(); #endif /* See strip.py */ #ifndef __UPSTREAM__ /* See strip.py */ tt_set_temp(homa->temp); @@ -695,6 +703,7 @@ void __exit homa_unload(void) #ifndef __STRIP__ /* See strip.py */ homa_gro_unhook_tcp(); + homa_gro_unhook_udp(); #endif /* See strip.py */ if (timer_kthread) { timer_thread_exit = 1; diff --git a/homa_qdisc.c b/homa_qdisc.c index 055acc69..b6af33b4 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ 
-557,15 +557,17 @@ bool homa_qdisc_can_bypass(struct sk_buff *skb, struct homa_qdisc *q) bool result; int element; - /* Collect information from skb. If it isn't a TCP packet then + /* Collect information from skb. If it isn't a TCP or UDP packet then * reordering constraints are unknown so deny reordering. */ if (skb->protocol == htons(ETH_P_IP)) { - if (ip_hdr(skb)->protocol != IPPROTO_TCP) + if (ip_hdr(skb)->protocol != IPPROTO_TCP + && ip_hdr(skb)->protocol != IPPROTO_UDP) return false; daddr = ip_hdr(skb)->daddr; } else if (skb->protocol == htons(ETH_P_IPV6)) { - if (ipv6_hdr(skb)->nexthdr != IPPROTO_TCP) + if (ipv6_hdr(skb)->nexthdr != IPPROTO_TCP + && ipv6_hdr(skb)->nexthdr != IPPROTO_UDP) return false; daddr = ipv6_hdr(skb)->daddr.in6_u.u6_addr32[0] ^ ipv6_hdr(skb)->daddr.in6_u.u6_addr32[1] ^ @@ -591,11 +593,13 @@ bool homa_qdisc_can_bypass(struct sk_buff *skb, struct homa_qdisc *q) skb_queue_walk(&q->deferred_tcp, skb2) { element++; if (skb2->protocol == htons(ETH_P_IP)) { - if (ip_hdr(skb2)->protocol != IPPROTO_TCP) + if (ip_hdr(skb2)->protocol != IPPROTO_TCP + && ip_hdr(skb2)->protocol != IPPROTO_UDP) continue; daddr2 = ip_hdr(skb2)->daddr; } else if (skb2->protocol == htons(ETH_P_IPV6)) { - if (ipv6_hdr(skb2)->nexthdr != IPPROTO_TCP) + if (ipv6_hdr(skb2)->nexthdr != IPPROTO_TCP + && ipv6_hdr(skb2)->nexthdr != IPPROTO_UDP) continue; daddr2 = ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[0] ^ ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[1] ^ @@ -757,8 +761,9 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) pkt_len = qdisc_pkt_len(skb); homa_qdisc_update_link_idle(qdev, pkt_len, -1); - if (ip_hdr(skb)->protocol == IPPROTO_TCP) - tt_record_tcp("homa_qdisc_pacer requeued TCP packet from " + if (ip_hdr(skb)->protocol == IPPROTO_TCP + || ip_hdr(skb)->protocol == IPPROTO_UDP) + tt_record_tcp("homa_qdisc_pacer requeued TCP/UDP packet from " "0x%x to 0x%x, data bytes %d, seq/ack %u", skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); homa_qdisc_schedule_skb(skb, 
q->qdisc); diff --git a/homa_sock.c b/homa_sock.c index 8991ca83..b2eb332d 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -164,6 +164,8 @@ int homa_sock_init(struct homa_sock *hsk) #ifndef __STRIP__ /* See strip.py */ if (homa->hijack_tcp) hsk->sock.sk_protocol = IPPROTO_TCP; + else if (homa->hijack_udp) + hsk->sock.sk_protocol = IPPROTO_UDP; #endif /* See strip.py */ /* Do things requiring memory allocation before locking the socket, diff --git a/homa_wire.h b/homa_wire.h index 0cd160ec..cf1b898a 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -139,13 +139,15 @@ struct homa_common_hdr { #ifndef __STRIP__ /* See strip.py */ /** * @flags: Holds TCP flags such as URG, ACK, etc. The special value - * HOMA_TCP_FLAGS is stored here to distinguish Homa-over-TCP packets - * from real TCP packets. It includes the SYN and RST flags, - * which TCP would never use together; must not include URG or FIN - * (TSO will turn off FIN for all but the last segment). + * HOMA_TCP_FLAGS is stored here to distinguish Homa-over-TCP and + * Homa-over-UDP packets from real TCP/UDP packets. It includes the + * SYN and RST flags, which TCP would never use together; must not + * include URG or FIN (TSO will turn off FIN for all but the last + * segment). */ u8 flags; #define HOMA_TCP_FLAGS 6 +#define HOMA_UDP_FLAGS 5 #else /* See strip.py */ /** @reserved1: Not used (corresponds to TCP flags). */ u8 reserved1; @@ -168,12 +170,14 @@ struct homa_common_hdr { #ifndef __STRIP__ /* See strip.py */ /** * @urgent: occupies the same bytes as the urgent pointer in a TCP - * header. When Homa packets are transmitted over TCP, this has the - * special value HOMA_TCP_URGENT (which is set even though URG is - * not set) to indicate that the packet is actually a Homa packet. + * header. When Homa packets are transmitted over TCP or UDP, this + * has the special value HOMA_TCP_URGENT or HOMA_UDP_URGENT (set + * even though URG is not set) to indicate that the packet is + * actually a Homa packet. 
*/ __be16 urgent; #define HOMA_TCP_URGENT 0xb97d +#define HOMA_UDP_URGENT 0xb97e #else /* See strip.py */ /** @reserved2: Not used (corresponds to TCP urgent field). */ __be16 reserved2; diff --git a/util/cp_node.cc b/util/cp_node.cc index 4665c3f2..9095aa0d 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -338,7 +338,7 @@ void print_help(const char *name) printf(" --ipv6 Use IPv6 instead of IPv4\n"); printf(" --pin All server threads will be restricted to run only\n" " on the givevn core\n"); - printf(" --protocol Transport protocol to use: homa or tcp (default: %s)\n", + printf(" --protocol Transport protocol to use: homa, tcp, or udp (default: %s)\n", protocol); printf(" --port-threads Number of server threads to service each port\n" " (default: %d)\n", @@ -2708,6 +2708,347 @@ void tcp_client::read(tcp_connection *connection, int pid) } } +/* ===================== UDP client and server ===================== */ + +/** + * class udp_server - Holds information about a single UDP server, + * which consists of a thread that handles requests on a given port. + */ +class udp_server { +public: + udp_server(int port, int id, int num_threads, + std::string& experiment); + ~udp_server(); + void server(int thread_id); + + /** @port: Port on which we listen. */ + int port; + + /** @id: Unique identifier for this server. */ + int id; + + /** @experiment: name of the experiment this server is running. */ + string experiment; + + /** @fd: File descriptor for the UDP socket. */ + int fd; + + /** @metrics: Performance statistics. Not owned by this class. */ + server_metrics *metrics; + + /** @threads: Background threads servicing this socket. */ + std::vector threads; + + /** @stop: True means background threads should exit. */ + bool stop; +}; + +/** @udp_servers: keeps track of all existing UDP servers. */ +std::vector udp_servers; + +/** + * udp_server::udp_server() - Constructor for udp_server objects. + * @port: Port number on which this server should listen. 
+ * @id: Unique identifier for this server. + * @num_threads: Number of threads to service this socket. + * @experiment: Name of the experiment. + */ +udp_server::udp_server(int port, int id, int num_threads, + std::string& experiment) + : port(port) + , id(id) + , fd(-1) + , metrics() + , threads() + , stop(false) +{ + if (std::find(experiments.begin(), experiments.end(), experiment) + == experiments.end()) + experiments.emplace_back(experiment); + + fd = socket(inet_family, SOCK_DGRAM, 0); + if (fd == -1) { + log(NORMAL, "FATAL: couldn't open UDP server socket: %s\n", + strerror(errno)); + fatal(); + } + sockaddr_in_union addr; + if (inet_family == AF_INET) { + addr.in4.sin_family = AF_INET; + addr.in4.sin_port = htons(port); + addr.in4.sin_addr.s_addr = INADDR_ANY; + } else { + addr.in6.sin6_family = AF_INET6; + addr.in6.sin6_port = htons(port); + addr.in6.sin6_addr = in6addr_any; + } + if (bind(fd, &addr.sa, sizeof(addr)) == -1) { + log(NORMAL, "FATAL: couldn't bind UDP socket to port %d: %s\n", + port, strerror(errno)); + fatal(); + } + + metrics = new server_metrics(experiment); + ::metrics.push_back(metrics); + + for (int i = 0; i < num_threads; i++) + threads.emplace_back(&udp_server::server, this, i); + kfreeze_count = 0; +} + +/** + * udp_server::~udp_server() - Destructor for UDP servers. + */ +udp_server::~udp_server() +{ + stop = true; + shutdown(fd, SHUT_RDWR); + for (size_t i = 0; i < threads.size(); i++) + threads[i].join(); + close(fd); +} + +/** + * udp_server::server() - Handles incoming UDP requests. Invoked as top-level + * method in a thread. + * @thread_id: Unique id for this thread. 
+ */ +void udp_server::server(int thread_id) +{ + char thread_name[50]; + char buffer[1000000]; + + snprintf(thread_name, sizeof(thread_name), "US%d.%d", id, thread_id); + time_trace::thread_buffer thread_buffer(thread_name); + int pid = syscall(__NR_gettid); + if (server_core >= 0) + pin_thread(server_core); + + while (!stop) { + sockaddr_in_union source; + socklen_t source_len = sizeof(source); + ssize_t length = recvfrom(fd, buffer, sizeof(buffer), 0, + &source.sa, &source_len); + if (length < 0) { + if (stop) + return; + if ((errno == EAGAIN) || (errno == EINTR)) + continue; + log(NORMAL, "FATAL: UDP recvfrom failed: %s\n", + strerror(errno)); + fatal(); + } + if (length < (ssize_t)sizeof(message_header)) + continue; + + message_header *header = + reinterpret_cast(buffer); + metrics->requests++; + metrics->bytes_in += header->length; + tt("Received UDP request, cid 0x%08x, id %u, length %d, " + "pid %d", header->cid, header->msg_id, + header->length, pid); + + if ((header->freeze) && !time_trace::frozen) { + tt("Freezing timetrace"); + time_trace::freeze(); + kfreeze(); + } + + /* Prepare and send response. */ + int resp_length = header->short_response ? 100 : header->length; + if (resp_length < (int)sizeof(message_header)) + resp_length = sizeof(message_header); + header->response = 1; + header->length = resp_length; + metrics->bytes_out += resp_length; + + ssize_t sent = sendto(fd, buffer, resp_length, 0, + &source.sa, source_len); + if (sent < 0) + log(NORMAL, "ERROR: UDP sendto failed: %s\n", + strerror(errno)); + tt("Sent UDP response, cid 0x%08x, id %u, length %d", + header->cid, header->msg_id, resp_length); + } +} + +/** + * class udp_client - Holds information about a single UDP client, + * which consists of one thread issuing requests and one thread receiving + * responses. 
+ */ +class udp_client : public client { +public: + udp_client(int id, std::string& experiment); + virtual ~udp_client(); + void receiver(int id); + void sender(void); + + /** @fd: UDP socket file descriptor. */ + int fd; + + /** @stop: True means background threads should exit. */ + bool stop; + + /** @receiving_threads: threads that receive responses. */ + std::vector receiving_threads; + + /** + * @sending_thread: thread that sends requests. + */ + std::optional sending_thread; +}; + +/** + * udp_client::udp_client() - Constructor for udp_client objects. + * @id: Unique identifier for this client. + * @experiment: Name of experiment. + */ +udp_client::udp_client(int id, std::string& experiment) + : client(id, experiment) + , fd(-1) + , stop(false) + , receiving_threads() + , sending_thread() +{ + fd = socket(inet_family, SOCK_DGRAM, 0); + if (fd < 0) { + log(NORMAL, "FATAL: couldn't open UDP client socket: %s\n", + strerror(errno)); + fatal(); + } + + for (int i = 0; i < port_receivers; i++) + receiving_threads.emplace_back(&udp_client::receiver, this, i); + while (receivers_running < receiving_threads.size()) { + /* Wait for receivers to begin execution before starting + * the sender. + */ + } + sending_thread.emplace(&udp_client::sender, this); +} + +/** + * udp_client::~udp_client() - Destructor for udp_client objects. + */ +udp_client::~udp_client() +{ + stop = true; + shutdown(fd, SHUT_RDWR); + if (sending_thread) + sending_thread->join(); + for (std::thread& thread: receiving_threads) + thread.join(); + close(fd); + check_completion("udp"); +} + +/** + * udp_client::sender() - Invoked as the top-level method in a thread; + * invokes a pseudo-random stream of RPCs continuously. 
+ */ +void udp_client::sender() +{ + char thread_name[50]; + char buffer[HOMA_MAX_MESSAGE_LENGTH]; + int pid = syscall(__NR_gettid); + + snprintf(thread_name, sizeof(thread_name), "C%d", id); + time_trace::thread_buffer thread_buffer(thread_name); + + uint64_t next_start = rdtsc(); + message_header *header = reinterpret_cast(buffer); + + while (1) { + uint64_t now; + int server; + int slot = get_rinfo(); + + while (1) { + if (stop) { + rinfos[slot].active = false; + return; + } + now = rdtsc(); + if ((now >= next_start) && + ((total_requests - total_responses) + < client_port_max)) + break; + } + + rinfos[slot].start_time = now; + server = server_dist(rand_gen); + header->length = length_dist(rand_gen); + if (header->length > HOMA_MAX_MESSAGE_LENGTH) + header->length = HOMA_MAX_MESSAGE_LENGTH; + if (header->length < (int)sizeof(message_header)) + header->length = sizeof(message_header); + rinfos[slot].request_length = header->length; + header->cid = server_conns[server]; + header->cid.client_port = id; + header->msg_id = slot; + header->freeze = freeze[header->cid.server]; + header->short_response = one_way; + header->response = 0; + tt("Sending UDP request, cid 0x%08x, id %u, length %d, " + "pid %d", header->cid, header->msg_id, + header->length, pid); + + ssize_t sent = sendto(fd, buffer, header->length, 0, + &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa)); + if (sent < 0) { + log(NORMAL, "FATAL: error in UDP sendto: %s (request " + "length %d)\n", strerror(errno), + header->length); + fatal(); + } + requests[server]++; + total_requests++; + lag = now - next_start; + next_start += interval_dist(rand_gen) * cycles_per_second; + } +} + +/** + * udp_client::receiver() - Invoked as the top-level method in a thread + * that waits for UDP responses and logs statistics. + * @receiver_id: Id of this receiver. 
+ */ +void udp_client::receiver(int receiver_id) +{ + char thread_name[50]; + char buffer[1000000]; + + snprintf(thread_name, sizeof(thread_name), "R%d.%d", id, receiver_id); + time_trace::thread_buffer thread_buffer(thread_name); + receivers_running++; + int pid = syscall(__NR_gettid); + + while (!stop) { + ssize_t length = recvfrom(fd, buffer, sizeof(buffer), + 0, NULL, NULL); + if (length < 0) { + if (stop) + return; + if ((errno == EAGAIN) || (errno == EINTR)) + continue; + log(NORMAL, "FATAL: UDP recvfrom failed in client: " + "%s\n", strerror(errno)); + fatal(); + } + if (length < (ssize_t)sizeof(message_header)) + continue; + uint64_t end_time = rdtsc(); + message_header *header = + reinterpret_cast(buffer); + record(end_time, header); + tt("Response for cid 0x%08x received by pid %d", + header->cid, pid); + } +} + /** * homa_info() - Use the HOMAIOCINFO ioctl to extract the status of a * Homa socket and print the information to the log. @@ -3173,6 +3514,10 @@ int client_cmd(std::vector &words) if (first_port == -1) first_port = 4000; clients.push_back(new homa_client(i, experiment)); + } else if (strcmp(protocol, "udp") == 0) { + if (first_port == -1) + first_port = 6000; + clients.push_back(new udp_client(i, experiment)); } else { if (first_port == -1) first_port = 5000; @@ -3454,6 +3799,14 @@ int server_cmd(std::vector &words) experiment); homa_servers.push_back(server); } + } else if (strcmp(protocol, "udp") == 0) { + if (first_port == -1) + first_port = 6000; + for (int i = 0; i < server_ports; i++) { + udp_server *server = new udp_server(first_port + i, + i, port_threads, experiment); + udp_servers.push_back(server); + } } else { if (first_port == -1) first_port = 5000; @@ -3492,6 +3845,9 @@ int stop_cmd(std::vector &words) for (tcp_server *server: tcp_servers) delete server; tcp_servers.clear(); + for (udp_server *server: udp_servers) + delete server; + udp_servers.clear(); last_per_server_rpcs.clear(); for (server_metrics *m: metrics) delete m; diff 
--git a/util/homa_test.cc b/util/homa_test.cc index 5546089f..3ca1b3e1 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -760,6 +760,97 @@ void test_tcp(char *server_name, int port) return; } +/** + * udp_ping() - Send a request on a UDP socket and wait for the + * corresponding response. + * @fd: File descriptor for a UDP socket. + * @dest: Destination address. + * @dest_len: Size of @dest. + * @request: Buffer containing the request message. + * @length: Length of the request message. + */ +void udp_ping(int fd, struct sockaddr *dest, socklen_t dest_len, + void *request, int length) +{ + char response[1000000]; + int *int_response = reinterpret_cast(response); + ssize_t sent, received; + + sent = sendto(fd, request, length, 0, dest, dest_len); + if (sent != length) { + printf("UDP sendto failed: %s\n", strerror(errno)); + exit(1); + } + received = recvfrom(fd, response, sizeof(response), 0, NULL, NULL); + if (received < 0) { + printf("UDP recvfrom failed: %s\n", strerror(errno)); + exit(1); + } + if (received < (ssize_t)(2 * sizeof(int))) + return; + if (received != int_response[1]) + printf("Expected %d bytes in UDP response, got %ld\n", + int_response[1], received); +} + +/** + * test_udp() - Measure round-trip time for an RPC sent via a UDP socket. + * @server_name: Name of the server machine. + * @port: Server port to connect to. 
+ */ +void test_udp(char *server_name, int port) +{ + struct addrinfo hints; + struct addrinfo *matching_addresses; + struct sockaddr *dest; + socklen_t dest_len; + int status, i; + int buffer[250000]; + + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = inet_family; + hints.ai_socktype = SOCK_DGRAM; + status = getaddrinfo(server_name, "80", &hints, &matching_addresses); + if (status != 0) { + printf("Couldn't look up address for %s: %s\n", + server_name, gai_strerror(status)); + exit(1); + } + dest = matching_addresses->ai_addr; + ((struct sockaddr_in *) dest)->sin_port = htons(port); + dest_len = matching_addresses->ai_addrlen; + + int fd = socket(inet_family, SOCK_DGRAM, 0); + if (fd == -1) { + printf("Couldn't open UDP socket: %s\n", strerror(errno)); + exit(1); + } + + /* Warm up. */ + buffer[0] = length; + buffer[1] = length; + seed_buffer(&buffer[2], sizeof32(buffer) - 2*sizeof32(int), seed); + for (i = 0; i < 10; i++) + udp_ping(fd, dest, dest_len, buffer, length); + + uint64_t times[count+1]; + for (i = 0; i < count; i++) { + times[i] = rdtsc(); + udp_ping(fd, dest, dest_len, buffer, length); + } + times[count] = rdtsc(); + freeaddrinfo(matching_addresses); + + for (i = 0; i < count; i++) { + times[i] = times[i+1] - times[i]; + } + print_dist(times, count); + printf("Bandwidth at median: %.1f MB/sec\n", + 2.0*((double) length)/(to_seconds(times[count/2])*1e06)); + close(fd); + return; +} + /** * test_tcpstream() - Measure throughput of a TCP socket using --length as * the size of the buffer for each write system call. 
@@ -1158,6 +1249,8 @@ int main(int argc, char** argv) test_tcpstream(host, port); } else if (strcmp(argv[next_arg], "tmp") == 0) { test_tmp(fd, count); + } else if (strcmp(argv[next_arg], "udp") == 0) { + test_udp(host, port); } else if (strcmp(argv[next_arg], "udpclose") == 0) { test_udpclose(); } else if (strcmp(argv[next_arg], "wmem") == 0) { diff --git a/util/server.cc b/util/server.cc index a87d753d..658529d3 100644 --- a/util/server.cc +++ b/util/server.cc @@ -307,6 +307,72 @@ void tcp_server(int port) } } +/** + * udp_server() - Opens a UDP socket and handles all requests arriving on + * that socket. Each request is a datagram whose first word is the total + * message length and second word is the desired response length. + * @port: Port number on which to listen. + */ +void udp_server(int port) +{ + int fd; + char buffer[1000000]; + sockaddr_in_union addr; + sockaddr_in_union source; + socklen_t source_len; + + fd = socket(inet_family, SOCK_DGRAM, 0); + if (fd < 0) { + printf("Couldn't open UDP socket: %s\n", strerror(errno)); + return; + } + memset(&addr, 0, sizeof(addr)); + addr.in4.sin_family = inet_family; + addr.in4.sin_port = htons(port); + if (bind(fd, &addr.sa, sizeof(addr)) != 0) { + printf("Couldn't bind UDP socket to port %d: %s\n", port, + strerror(errno)); + return; + } + if (verbose) + printf("Successfully bound to UDP port %d\n", port); + + while (1) { + int *int_buffer = reinterpret_cast(buffer); + ssize_t length; + int resp_length; + + source_len = sizeof(source); + length = recvfrom(fd, buffer, sizeof(buffer), 0, + &source.sa, &source_len); + if (length < 0) { + printf("UDP recvfrom failed: %s\n", strerror(errno)); + continue; + } + if (length < (ssize_t)(2 * sizeof(int))) { + if (verbose) + printf("UDP message too short (%ld bytes) " + "from %s\n", length, + print_address(&source)); + continue; + } + resp_length = int_buffer[1]; + if (verbose) + printf("Received UDP message from %s with %ld bytes, " + "response length %d\n", + 
print_address(&source), length, resp_length); + if (resp_length <= 0) + continue; + if (resp_length > (int)sizeof(buffer)) + resp_length = sizeof(buffer); + /* Echo the header back so the client can match responses. */ + if (sendto(fd, buffer, resp_length, 0, + &source.sa, source_len) < 0) { + printf("UDP sendto failed: %s\n", strerror(errno)); + } + } +} + int main(int argc, char** argv) { int next_arg; int num_ports = 1; @@ -356,5 +422,8 @@ int main(int argc, char** argv) { thread.detach(); } + std::thread udp_thread(udp_server, port); + udp_thread.detach(); + tcp_server(port); }