diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9b9e08a18b619f..15b58a94c1d1e3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -992,9 +992,11 @@ enum bpf_cmd {
 	BPF_PROG_BIND_MAP,
 	BPF_TOKEN_CREATE,
 	BPF_PROG_STREAM_READ_BY_FD,
 	BPF_PROG_ASSOC_STRUCT_OPS,
 	BPF_PROG_LOAD_REX_BASE,
 	BPF_PROG_LOAD_REX,
 	BPF_PROG_TERMINATE,
+	BPF_SCHED_EXT_ATTACH_REX,
+	BPF_SCHED_EXT_DETACH_REX,
 	__MAX_BPF_CMD,
 };
@@ -1528,6 +1530,16 @@ struct rex_text_sym {
 	const char __user *symbol;
 };
 
+/*
+ * One sched_ext callback passed to BPF_SCHED_EXT_ATTACH_REX: @name is the
+ * sched_ext_ops member to populate and @offset is the callback's byte offset
+ * into the memory of the Rex base program.
+ */
+struct rex_sched_ops_sym {
+	const char __user *name;
+	__u64 offset;
+};
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -1955,6 +1967,17 @@ union bpf_attr {
 		__u32		prog_fd;
 	} prog_stream_read;
 
+	struct { /* BPF_SCHED_EXT_ATTACH_REX */
+		__u32		base_prog_fd;
+		__aligned_u64	sched_ops_syms;	/* ptr to rex_sched_ops_sym array */
+		__u32		nr_sched_ops_syms;
+		__u32		timeout_ms;	/* ops.timeout_ms, 0 = default */
+		__u32		exit_dump_len;	/* ops.exit_dump_len, 0 = default */
+		__u32		pad;
+		__aligned_u64	ops_flags;	/* SCX_OPS_* flags */
+		char		name[128];	/* ops.name; empty string = use base->aux->name */
+	} sched_ext_attach;
+
 	struct {
 		__u32		map_fd;
 		__u32		prog_fd;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2627e4d385c705..429dee647d7d2e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4103,6 +4103,90 @@ static int bpf_prog_load_rex_base(union bpf_attr *attr, bpfptr_t uattr)
 	return err;
 }
 
+extern int scx_enable_rex(struct bpf_prog *base,
+			  struct rex_sched_ops_sym __user *usyms, u32 nr_syms,
+			  u64 ops_flags, u32 timeout_ms, u32 exit_dump_len,
+			  const char *user_name);
+extern int scx_disable_rex(void);
+
+/*
+ * BPF_SCHED_EXT_ATTACH_REX: install the callbacks of a loaded Rex base
+ * program as the sched_ext scheduler. Requires bpf_capable(); nothing is
+ * logged before that check so unprivileged callers cannot spam the log.
+ *
+ * Returns 0 on success, in which case the bpf_prog_get() reference on the
+ * base program is kept, or a negative errno with that reference dropped.
+ */
+static int bpf_sched_ext_attach_rex(union bpf_attr *attr, bpfptr_t uattr)
+{
+	struct bpf_prog *base;
+	int err;
+
+	if (!bpf_capable())
+		return -EPERM;
+
+	pr_info("bpf_syscall: BPF_SCHED_EXT_ATTACH_REX called (fd=%u, nr_syms=%u)\n",
+		attr->sched_ext_attach.base_prog_fd,
+		attr->sched_ext_attach.nr_sched_ops_syms);
+
+	if (!attr->sched_ext_attach.base_prog_fd ||
+	    !attr->sched_ext_attach.sched_ops_syms ||
+	    !attr->sched_ext_attach.nr_sched_ops_syms)
+		return -EINVAL;
+
+	base = bpf_prog_get(attr->sched_ext_attach.base_prog_fd);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	if (base->type != BPF_PROG_TYPE_REX_BASE) {
+		err = -EINVAL;
+		goto put_prog;
+	}
+
+	pr_info("bpf_syscall: Rex base prog verified, forwarding to scx_enable_rex()\n");
+
+	/* Ensure the user-provided name is NUL-terminated before handing it
+	 * to scx_enable_rex(). The UAPI struct is a char[128]; treat an empty
+	 * first byte as "no name provided, fall back to base->aux->name". */
+	attr->sched_ext_attach.name[sizeof(attr->sched_ext_attach.name) - 1] = '\0';
+
+	err = scx_enable_rex(base,
+			     u64_to_user_ptr(attr->sched_ext_attach.sched_ops_syms),
+			     attr->sched_ext_attach.nr_sched_ops_syms,
+			     attr->sched_ext_attach.ops_flags,
+			     attr->sched_ext_attach.timeout_ms,
+			     attr->sched_ext_attach.exit_dump_len,
+			     attr->sched_ext_attach.name);
+	if (err)
+		goto put_prog;
+
+	pr_info("bpf_syscall: BPF_SCHED_EXT_ATTACH_REX succeeded\n");
+
+	/*
+	 * Keep the bpf_prog_get() reference: the scheduler callbacks point
+	 * into base->mem.mem which must stay alive. scx_enable_rex() saved
+	 * the pointer; scx_disable_workfn() releases it when the scheduler goes.
+	 */
+	return 0;
+
+put_prog:
+	bpf_prog_put(base);
+	return err;
+}
+
+/*
+ * BPF_SCHED_EXT_DETACH_REX: tear down the Rex scheduler. Requires
+ * bpf_capable(); returns the result of scx_disable_rex().
+ */
+static int bpf_sched_ext_detach_rex(void)
+{
+	if (!bpf_capable())
+		return -EPERM;
+
+	pr_info("bpf_syscall: BPF_SCHED_EXT_DETACH_REX called\n");
+
+	return scx_disable_rex();
+}
 
 #define BPF_OBJ_LAST_FIELD path_fd
 
@@ -7214,6 +7298,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_LOAD_REX:
 		err = bpf_prog_load_rex(&attr, uattr);
 		break;
+	case BPF_SCHED_EXT_ATTACH_REX:
+		err = bpf_sched_ext_attach_rex(&attr, uattr);
+		break;
+	case BPF_SCHED_EXT_DETACH_REX:
+		err = bpf_sched_ext_detach_rex();
+		break;
 	case BPF_OBJ_PIN:
 		err = bpf_obj_pin(&attr);
 		break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5d917b2ababe22..d2b4d330a37319 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2531,12 +2531,6 @@ static int migration_cpu_stop(void *data)
 	 */
 	flush_smp_call_function_queue(task_pt_regs(current));
 
-	/*
-	 * We may change the underlying rq, but the locks held will
-	 * appropriately be "transferred" when switching.
- */ - context_unsafe_alias(rq); - raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 064eaa76be4b9f..9c21aaa175fec1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2831,9 +2831,13 @@ static void scx_watchdog_workfn(struct work_struct *work) WRITE_ONCE(scx_watchdog_timestamp, jiffies); + pr_info_ratelimited("sched_ext: watchdog tick (checking all CPUs)\n"); + for_each_online_cpu(cpu) { - if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) { + pr_warn("sched_ext: watchdog detected timeout on CPU %d!\n", cpu); break; + } cond_resched(); } @@ -4355,6 +4359,9 @@ static void free_kick_syncs(void) } } +static DEFINE_MUTEX(scx_rex_mutex); +static struct bpf_prog *scx_rex_base_prog; + static void scx_disable_workfn(struct kthread_work *work) { struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); @@ -4498,6 +4505,26 @@ static void scx_disable_workfn(struct kthread_work *work) mutex_unlock(&scx_enable_mutex); + /* + * Drop refs taken when this sched was attached via the Rex syscall + * path. The struct_ops path drops its equivalents via bpf_scx_unreg(); + * Rex has no link, so we do it here so cleanup fires for every disable + * kind (UNREG, ERROR_STALL, watchdog, sysrq, ...). 
+	 */
+	if (sch->rex_base) {
+		struct bpf_prog *base = sch->rex_base;
+
+		sch->rex_base = NULL;
+
+		mutex_lock(&scx_rex_mutex);
+		if (scx_rex_base_prog == base)
+			scx_rex_base_prog = NULL;
+		mutex_unlock(&scx_rex_mutex);
+
+		kobject_put(&sch->kobj);
+		bpf_prog_put(base);
+	}
+
 	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
 done:
 	scx_bypass(false);
@@ -5100,6 +5127,8 @@ static void scx_enable_workfn(struct kthread_work *work)
 	if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
 		WRITE_ONCE(scx_aborting, false);
 
+	pr_info("sched_ext: [1/6] state -> SCX_ENABLING\n");
+
 	atomic_long_set(&scx_nr_rejected, 0);
 
 	for_each_possible_cpu(cpu)
@@ -5120,6 +5149,7 @@ static void scx_enable_workfn(struct kthread_work *work)
 	scx_idle_enable(ops);
 
 	if (sch->ops.init) {
+		pr_info("sched_ext: [2/6] calling ops.init() ...\n");
 		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
 		if (ret) {
 			ret = ops_sanitize_err(sch, "init", ret);
@@ -5128,6 +5158,7 @@ static void scx_enable_workfn(struct kthread_work *work)
 			goto err_disable;
 		}
 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
+		pr_info("sched_ext: [2/6] ops.init() returned successfully\n");
 	}
 
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
@@ -5165,7 +5196,9 @@ static void scx_enable_workfn(struct kthread_work *work)
 	WRITE_ONCE(scx_watchdog_timeout, timeout);
 	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
 			   READ_ONCE(scx_watchdog_timeout) / 2);
+	pr_info("sched_ext: [3/6] watchdog armed (timeout=%u ms)\n",
+		jiffies_to_msecs(timeout));
 
 	/*
 	 * Once __scx_enabled is set, %current can be switched to SCX anytime.
@@ -5191,6 +5224,7 @@ static void scx_enable_workfn(struct kthread_work *work)
 
 	WARN_ON_ONCE(scx_init_task_enabled);
 	scx_init_task_enabled = true;
+	pr_info("sched_ext: [4/6] initializing all existing tasks for SCX ...\n");
 
 	/*
 	 * Enable ops for every task. 
Fork is excluded by scx_fork_rwsem
@@ -5245,6 +5279,7 @@ static void scx_enable_workfn(struct kthread_work *work)
 	 */
 	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
 	static_branch_enable(&__scx_enabled);
+	pr_info("sched_ext: [5/6] scx_enabled=true, switching all tasks to SCX class ...\n");
 
 	/*
 	 * We're fully committed and can't fail. The task READY -> ENABLED
@@ -5283,6 +5318,7 @@ static void scx_enable_workfn(struct kthread_work *work)
 	if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
 		static_branch_enable(&__scx_switched_all);
 
+	pr_info("sched_ext: [6/6] state -> SCX_ENABLED. Scheduler takeover COMPLETE!\n");
 	pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
 		sch->ops.name, scx_switched_all() ? "" : " (partial)");
 	kobject_uevent(&sch->kobj, KOBJ_ADD);
@@ -5485,6 +5521,242 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 	return scx_enable(kdata, link);
 }
 
+/* Most callback symbols a single Rex attach request may carry. */
+#define SCX_REX_MAX_OPS_SYMS	64
+
+/**
+ * scx_enable_rex - enable a sched_ext scheduler backed by a Rex base program
+ * @base: Rex base program whose memory holds the callbacks; the caller's
+ *	  reference is handed over to the scheduler on success
+ * @usyms: user array of (callback name, offset into @base's memory) pairs
+ * @nr_syms: number of entries in @usyms
+ * @ops_flags: copied to sched_ext_ops.flags
+ * @timeout_ms: copied to sched_ext_ops.timeout_ms
+ * @exit_dump_len: copied to sched_ext_ops.exit_dump_len
+ * @user_name: scheduler name; NULL or "" selects @base->aux->name
+ *
+ * Return: 0 on success, in which case scx_disable_workfn() later drops the
+ * reference on @base. A negative errno on failure, in which case the caller
+ * still owns that reference.
+ */
+int scx_enable_rex(struct bpf_prog *base,
+		   struct rex_sched_ops_sym __user *usyms, u32 nr_syms,
+		   u64 ops_flags, u32 timeout_ms, u32 exit_dump_len,
+		   const char *user_name)
+{
+	struct sched_ext_ops *ops;
+	struct rex_sched_ops_sym *syms;
+	struct scx_sched *sch;
+	char name_buf[128];
+	u32 i;
+	int err;
+
+	pr_info("sched_ext_rex: === REX ENABLE START === nr_syms=%u\n", nr_syms);
+
+	if (nr_syms > SCX_REX_MAX_OPS_SYMS)
+		return -EINVAL;
+
+	syms = kvmalloc_array(nr_syms, sizeof(*syms), GFP_KERNEL);
+	if (!syms)
+		return -ENOMEM;
+
+	if (copy_from_user(syms, usyms, nr_syms * sizeof(*syms))) {
+		err = -EFAULT;
+		goto free_syms;
+	}
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (!ops) {
+		err = -ENOMEM;
+		goto free_syms;
+	}
+
+	for (i = 0; i < nr_syms; i++) {
+		void *fn;
+		long name_len;
+		bool matched;
+
+		name_len = strncpy_from_user(name_buf, syms[i].name,
+					     sizeof(name_buf));
+		if (name_len <= 0 || name_len >= sizeof(name_buf)) {
+			err = -EFAULT;
+			goto free_ops;
+		}
+
+		if (syms[i].offset >= (u64)base->mem.total_page << PAGE_SHIFT) {
+			err = -EINVAL;
+			goto free_ops;
+		}
+
+		fn = (void *)((u64)base->mem.mem + syms[i].offset);
+		matched = false;
+
+#define SCX_OP_MATCH(field) \
+	do { if (!strcmp(name_buf, #field)) { ops->field = fn; matched = true; } } while (0)
+
+		if (!matched) SCX_OP_MATCH(select_cpu);
+		if (!matched) SCX_OP_MATCH(enqueue);
+		if (!matched) SCX_OP_MATCH(dequeue);
+		if (!matched) SCX_OP_MATCH(dispatch);
+		if (!matched) SCX_OP_MATCH(tick);
+		if (!matched) SCX_OP_MATCH(runnable);
+		if (!matched) SCX_OP_MATCH(running);
+		if (!matched) SCX_OP_MATCH(stopping);
+		if (!matched) SCX_OP_MATCH(quiescent);
+		if (!matched) SCX_OP_MATCH(yield);
+		if (!matched) SCX_OP_MATCH(core_sched_before);
+		if (!matched) SCX_OP_MATCH(set_weight);
+		if (!matched) SCX_OP_MATCH(set_cpumask);
+		if (!matched) SCX_OP_MATCH(update_idle);
+		if (!matched) SCX_OP_MATCH(cpu_acquire);
+		if (!matched) SCX_OP_MATCH(cpu_release);
+		if (!matched) SCX_OP_MATCH(init_task);
+		if (!matched) SCX_OP_MATCH(exit_task);
+		if (!matched) SCX_OP_MATCH(enable);
+		if (!matched) SCX_OP_MATCH(disable);
+		if (!matched) SCX_OP_MATCH(dump);
+		if (!matched) SCX_OP_MATCH(dump_cpu);
+		if (!matched) SCX_OP_MATCH(dump_task);
+#ifdef CONFIG_EXT_GROUP_SCHED
+		if (!matched) SCX_OP_MATCH(cgroup_init);
+		if (!matched) SCX_OP_MATCH(cgroup_exit);
+		if (!matched) SCX_OP_MATCH(cgroup_prep_move);
+		if (!matched) SCX_OP_MATCH(cgroup_move);
+		if (!matched) SCX_OP_MATCH(cgroup_cancel_move);
+		if (!matched) SCX_OP_MATCH(cgroup_set_weight);
+		if (!matched) SCX_OP_MATCH(cgroup_set_bandwidth);
+		if (!matched) SCX_OP_MATCH(cgroup_set_idle);
+#endif
+		if (!matched) SCX_OP_MATCH(cpu_online);
+		if (!matched) SCX_OP_MATCH(cpu_offline);
+		if (!matched) SCX_OP_MATCH(init);
+		if (!matched) SCX_OP_MATCH(exit);
+
+#undef SCX_OP_MATCH
+
+		if (!matched) {
+			pr_err("sched_ext_rex: unknown callback \"%s\"\n",
+			       name_buf);
+			err = -EINVAL;
+			goto free_ops;
+		}
+		pr_info("sched_ext_rex: matched callback \"%s\" at offset 0x%llx\n",
+			name_buf, syms[i].offset);
+	}
+
+	ops->flags = ops_flags;
+	ops->timeout_ms = timeout_ms;
+	ops->exit_dump_len = exit_dump_len;
+
+	/*
+	 * Prefer the user-supplied scheduler name (from SchedExtOps::name in
+	 * the Rust program's .struct_ops). Fall back to the base program's
+	 * name when the caller didn't provide one (empty string) -- this
+	 * preserves the pre-fix behaviour for older loaders.
+	 */
+	if (user_name && user_name[0] != '\0')
+		strscpy(ops->name, user_name, sizeof(ops->name));
+	else
+		strscpy(ops->name, base->aux->name, sizeof(ops->name));
+	pr_info("sched_ext_rex: all %u callbacks matched, calling scx_enable(\"%s\")\n",
+		nr_syms, ops->name);
+
+	mutex_lock(&scx_rex_mutex);
+
+	err = scx_enable(ops, NULL);
+	if (err) {
+		pr_err("sched_ext_rex: scx_enable() FAILED err=%d\n", err);
+		mutex_unlock(&scx_rex_mutex);
+		goto free_ops;
+	}
+
+	/*
+	 * Hand the bpf_prog_get() reference taken in bpf_sched_ext_attach_rex()
+	 * over to the scx_sched. scx_disable_workfn() will release it for any
+	 * disable kind, so we don't depend on userspace detach running.
+	 *
+	 * The store has to happen inside the RCU read-side section: once it
+	 * ends nothing keeps the scx_sched that scx_root pointed at alive.
+	 */
+	rcu_read_lock();
+	sch = rcu_dereference(scx_root);
+	if (sch)
+		sch->rex_base = base;
+	rcu_read_unlock();
+
+	if (!sch) {
+		/*
+		 * The scheduler is already gone again, so nobody would ever
+		 * drop the reference on @base. Fail the attach and let the
+		 * caller put it.
+		 */
+		mutex_unlock(&scx_rex_mutex);
+		pr_err("sched_ext_rex: scheduler exited before attach completed\n");
+		err = -ENODEV;
+		goto free_ops;
+	}
+
+	scx_rex_base_prog = base;
+	mutex_unlock(&scx_rex_mutex);
+
+	pr_info("sched_ext_rex: === REX ENABLE COMPLETE === scheduler is ACTIVE\n");
+	kvfree(syms);
+	kfree(ops);
+	return 0;
+
+free_ops:
+	kfree(ops);
+free_syms:
+	kvfree(syms);
+	return err;
+}
+EXPORT_SYMBOL_GPL(scx_enable_rex);
+
+/**
+ * scx_disable_rex - tear down the scheduler attached by scx_enable_rex()
+ *
+ * Return: always 0, including when no Rex scheduler is attached any more.
+ */
+int scx_disable_rex(void)
+{
+	struct scx_sched *sch;
+
+	pr_info("sched_ext_rex: === REX DISABLE START ===\n");
+
+	mutex_lock(&scx_rex_mutex);
+	if (!scx_rex_base_prog) {
+		/* Already torn down by another path (e.g. watchdog). */
+		mutex_unlock(&scx_rex_mutex);
+		pr_info("sched_ext_rex: === REX DISABLE: already disabled ===\n");
+		return 0;
+	}
+
+	/*
+	 * scx_disable_workfn() drops the scheduler's own kobject reference, so
+	 * pin the scx_sched before leaving the RCU read-side section; otherwise
+	 * the flush below could touch a disable_work that was already freed.
+	 * NOTE(review): assumes scx_sched is only freed after an RCU grace
+	 * period (it embeds an rcu_work) - confirm against the kobj release.
+	 */
+	rcu_read_lock();
+	sch = rcu_dereference(scx_root);
+	if (sch && !kobject_get_unless_zero(&sch->kobj))
+		sch = NULL;
+	rcu_read_unlock();
+	mutex_unlock(&scx_rex_mutex);
+
+	if (sch) {
+		pr_info("sched_ext_rex: disabling scheduler, switching tasks back to CFS ...\n");
+		scx_disable(SCX_EXIT_UNREG);
+		kthread_flush_work(&sch->disable_work);
+		kobject_put(&sch->kobj);
+	}
+
+	pr_info("sched_ext_rex: === REX DISABLE COMPLETE === back to default scheduler\n");
+	return 0;
+}
+EXPORT_SYMBOL_GPL(scx_disable_rex);
+
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 {
 	struct sched_ext_ops *ops = kdata;
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 00b450597f3e06..45c6f2d816c16e 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -916,6 +916,15 @@ struct scx_sched {
 	struct irq_work		error_irq_work;
 	struct kthread_work	disable_work;
 	struct rcu_work		rcu_work;
+
+	/*
+	 * Set when this scheduler was attached via BPF_SCHED_EXT_ATTACH_REX.
+	 * Owns one bpf_prog_get() reference taken at attach time and the
+	 * matching kobject_init_and_add() refcount; both are released by
+	 * scx_disable_workfn() so cleanup happens for every disable kind
+	 * (UNREG, ERROR_STALL, watchdog, sysrq, ...).
+ */ + struct bpf_prog *rex_base; }; enum scx_wake_flags { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 726684e73cf2a7..ffa425086580d0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -992,9 +992,11 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, - BPF_PROG_ASSOC_STRUCT_OPS, BPF_PROG_LOAD_REX_BASE, BPF_PROG_LOAD_REX, + BPF_SCHED_EXT_ATTACH_REX, + BPF_SCHED_EXT_DETACH_REX, + BPF_PROG_ASSOC_STRUCT_OPS, BPF_PROG_TERMINATE, __MAX_BPF_CMD, }; @@ -1510,6 +1512,11 @@ enum { BPF_STREAM_STDERR = 2, }; +struct rex_sched_ops_sym { + const char __user *name; + __u64 offset; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -1920,6 +1927,17 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { /* BPF_SCHED_EXT_ATTACH_REX */ + __u32 base_prog_fd; + __aligned_u64 sched_ops_syms; /* ptr to rex_sched_ops_sym array */ + __u32 nr_sched_ops_syms; + __u32 timeout_ms; /* ops.timeout_ms, 0 = default */ + __u32 exit_dump_len; /* ops.exit_dump_len, 0 = default */ + __u32 pad; + __aligned_u64 ops_flags; /* SCX_OPS_* flags */ + char name[128]; /* ops.name; empty string = use base->aux->name */ + } sched_ext_attach; + struct { __u32 map_fd; __u32 prog_fd;