Please note that this patch applies cleanly to Ubuntu 9.04's Linux kernel sources (2.6.28-15-generic, specifically). Patches being submitted for consideration on the Linux kernel mailing list are not kept here, and are made against the current mainline (2.6.31-rc5 at the time of this writing). If you want those, get them from the mailing list. You probably don't want the below patch unless you're experimenting with a stable (and older) kernel. diff -ru linux-2.6.28-orig/arch/ia64/ia32/binfmt_elf32.c linux-2.6.28/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.28-orig/arch/ia64/ia32/binfmt_elf32.c 2008-12-24 18:26:37.000000000 -0500 +++ linux-2.6.28/arch/ia64/ia32/binfmt_elf32.c 2009-10-02 00:10:12.000000000 -0400 @@ -223,12 +223,12 @@ static unsigned long elf32_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, - int prot, int type, unsigned long unused) + int prot, int type, unsigned long unused, unsigned long base) { unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK; return ia32_do_mmap(filep, (addr & IA32_PAGE_MASK), eppnt->p_filesz + pgoff, prot, type, - eppnt->p_offset - pgoff); + (eppnt->p_offset + base) - pgoff); } #define cpu_uses_ia32el() (local_cpu_data->family > 0x1f) diff -ru linux-2.6.28-orig/fs/binfmt_elf.c linux-2.6.28/fs/binfmt_elf.c --- linux-2.6.28-orig/fs/binfmt_elf.c 2009-10-03 02:11:23.000000000 -0400 +++ linux-2.6.28/fs/binfmt_elf.c 2009-10-02 18:42:52.000000000 -0400 @@ -44,8 +44,9 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); -static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, - int, int, unsigned long); +static unsigned long +elf_map(struct file *, unsigned long, struct elf_phdr *, + int, int, unsigned long, unsigned long); /* * If we don't support core dumping, then supply a NULL so we @@ -314,7 +315,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, - unsigned long total_size) + unsigned long total_size, unsigned long base_offset) { unsigned long map_addr; unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr); @@ -338,11 +339,14 @@ */ if (total_size) { total_size = ELF_PAGEALIGN(total_size); - map_addr = do_mmap(filep, addr, total_size, prot, type, off); + map_addr = do_mmap(filep, addr, total_size, prot, type, + off + base_offset); if (!BAD_ADDR(map_addr)) do_munmap(current->mm, map_addr+size, total_size-size); - } else - map_addr = do_mmap(filep, addr, size, prot, type, off); + } else { + map_addr = do_mmap(filep, addr, size, prot, type, + off + base_offset); + } up_write(¤t->mm->mmap_sem); return(map_addr); @@ -376,7 +380,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, unsigned long *interp_map_addr, - unsigned long no_base) + unsigned long no_base, unsigned long base_offset) { struct elf_phdr *elf_phdata; struct elf_phdr *eppnt; @@ -414,7 +418,7 @@ if (!elf_phdata) goto out; - retval = kernel_read(interpreter, interp_elf_ex->e_phoff, + retval = kernel_read(interpreter, interp_elf_ex->e_phoff + base_offset, (char *)elf_phdata,size); error = -EIO; if (retval != size) { @@ -450,7 +454,8 @@ load_addr = -vaddr; map_addr = elf_map(interpreter, load_addr + vaddr, - eppnt, elf_prot, elf_type, total_size); + eppnt, elf_prot, elf_type, total_size, + base_offset); total_size = 0; if (!*interp_map_addr) *interp_map_addr = map_addr; @@ -555,6 +560,94 @@ #endif } +/* + * See if we're a valid FatELF binary, find the right record, and + * load (*elf) with the actual ELF header. Sets (*offset) to the + * base offset of the chosen ELF binary. Returns 0 on success or a negative + * error code. + * If we're not a FatELF binary, (*elf) is loaded with the existing contents + * of (buf) and 0 is returned. + */ +static int examine_fatelf(struct file *file, const char *filename, char *buf, + int buflen, unsigned long *offset, struct elfhdr *elf) +{ + int records, i, rc; + const fatelf_hdr *fatelf = (fatelf_hdr *) buf; + + if (likely(le32_to_cpu(fatelf->magic) != FATELF_MAGIC)) { + *elf = *((struct elfhdr *)buf); /* treat like normal ELF. */ + return 0; /* not a FatELF binary; not an error. */ + } else if (unlikely(le16_to_cpu(fatelf->version) != 1)) { + return -ENOEXEC; /* Unrecognized format version. */ + } + + /* + * In theory, there could be 255 separate records packed into this + * binary, but for now, bprm->buf (128 bytes) holds exactly 5 + * records with the fatelf header, and that seems reasonable for + * most uses. We could add the complexity to read more records later + * if there's a serious need. + */ + records = (int) fatelf->num_records; /* uint8, no byteswap needed */ + + if (unlikely(records > 5)) { + records = 5; /* clamp, in case we find one we can use. */ + } + + for (i = 0; i < records; i++) { + const fatelf_record *record = &fatelf->records[i]; + const __u8 osabi = record->osabi; + const int abiok = likely( likely(osabi == ELFOSABI_NONE) || + unlikely(osabi == ELFOSABI_LINUX) ); + + /* Fill in the data elf_check_arch() might care about. */ + elf->e_ident[EI_OSABI] = record->osabi; + elf->e_ident[EI_CLASS] = record->word_size; + elf->e_ident[EI_DATA] = record->byte_order; + elf->e_machine = le16_to_cpu(record->machine); + + if (likely(!elf_check_arch(elf))) { + continue; /* Unsupported CPU architecture. */ + } else if (unlikely(!abiok)) { + continue; /* Unsupported OS ABI. */ + } else if (unlikely(record->osabi_version != 0)) { + continue; /* Unsupported OS ABI version. */ + } else { + /* We can support this ELF arch/abi. */ + const __u64 rec_offset = le64_to_cpu(record->offset); + const __u64 rec_size = le64_to_cpu(record->size); + const __u64 end_offset = rec_offset + rec_size; + const unsigned long uloff = (unsigned long) rec_offset; + + if (unlikely(end_offset < rec_offset)) { + continue; /* overflow (corrupt file?) */ + } else if (unlikely(ELF_PAGEOFFSET(uloff) != 0)) { + continue; /* bad alignment. */ + } + +#if BITS_PER_LONG == 32 + if (unlikely(end_offset > 0xFFFFFFFF)) { + continue; + } +#endif + + /* replace the FatELF data with the real ELF header. */ + rc = kernel_read(file, uloff, (char*) elf, sizeof(*elf)); + if (unlikely((rc != sizeof(*elf)) && (rc >= 0))) { + rc = -EIO; + } else if (likely(rc == sizeof(*elf))) { + *offset = uloff; + rc = 0; + } + + return rc; + } + } + + return -ENOEXEC; /* no binaries we could use. */ +} + + static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -567,6 +660,8 @@ int elf_exec_fileno; int retval, i; unsigned int size; + unsigned long base_offset = 0; + unsigned long interp_base_offset = 0; unsigned long elf_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; @@ -583,9 +678,12 @@ retval = -ENOMEM; goto out_ret; } - - /* Get the exec-header */ - loc->elf_ex = *((struct elfhdr *)bprm->buf); + + retval = examine_fatelf(bprm->file, bprm->filename, bprm->buf, + BINPRM_BUF_SIZE, &base_offset, &loc->elf_ex); + if (unlikely(retval < 0)) { + goto out_ret; + } retval = -ENOEXEC; /* First of all, some simple consistency checks */ @@ -611,7 +709,7 @@ if (!elf_phdata) goto out; - retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, + retval = kernel_read(bprm->file, loc->elf_ex.e_phoff + base_offset, (char *)elf_phdata, size); if (retval != size) { if (retval >= 0) @@ -651,7 +749,8 @@ if (!elf_interpreter) goto out_free_file; - retval = kernel_read(bprm->file, elf_ppnt->p_offset, + retval = kernel_read(bprm->file, + elf_ppnt->p_offset + base_offset, elf_interpreter, elf_ppnt->p_filesz); if (retval != elf_ppnt->p_filesz) { @@ -706,8 +805,13 @@ goto out_free_dentry; } - /* Get the exec headers */ - loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); + retval = examine_fatelf(interpreter, elf_interpreter, + bprm->buf, BINPRM_BUF_SIZE, + &interp_base_offset, + &loc->interp_elf_ex); + if (unlikely(retval < 0)) { + goto out_free_dentry; + } break; } elf_ppnt++; @@ -832,7 +936,7 @@ } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, - elf_prot, elf_flags, 0); + elf_prot, elf_flags, 0, base_offset); if (BAD_ADDR(error)) { send_sig(SIGKILL, current, 0); retval = IS_ERR((void *)error) ? @@ -913,7 +1017,7 @@ elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, &interp_map_addr, - load_bias); + load_bias, interp_base_offset); if (!IS_ERR((void *)elf_entry)) { /* * load_elf_interp() returns relocation @@ -1032,11 +1136,19 @@ unsigned long elf_bss, bss, len; int retval, error, i, j; struct elfhdr elf_ex; + unsigned long base_offset = 0; + char buf[BINPRM_BUF_SIZE]; - error = -ENOEXEC; - retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex)); - if (retval != sizeof(elf_ex)) + retval = kernel_read(file, 0, buf, sizeof(buf)); + if (unlikely(retval != sizeof(buf))) { + error = (retval >= 0) ? -EIO : retval; goto out; + } + error = examine_fatelf(file, 0, buf, sizeof(buf), &base_offset, &elf_ex); + if (unlikely(retval < 0)) { + goto out; + } + error = -ENOEXEC; if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) goto out; @@ -1058,7 +1170,8 @@ eppnt = elf_phdata; error = -ENOEXEC; - retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j); + retval = kernel_read(file, elf_ex.e_phoff + base_offset, + (char *)eppnt, j); if (retval != j) goto out_free_ph; @@ -1080,7 +1193,7 @@ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - - ELF_PAGEOFFSET(eppnt->p_vaddr))); + ELF_PAGEOFFSET(eppnt->p_vaddr)) + base_offset); up_write(¤t->mm->mmap_sem); if (error != ELF_PAGESTART(eppnt->p_vaddr)) goto out_free_ph; diff -ru linux-2.6.28-orig/include/linux/elf.h linux-2.6.28/include/linux/elf.h --- linux-2.6.28-orig/include/linux/elf.h 2008-12-24 18:26:37.000000000 -0500 +++ linux-2.6.28/include/linux/elf.h 2009-10-02 00:10:12.000000000 -0400 @@ -188,6 +188,30 @@ } Elf64_Sym; +/* FatELF (multiple ELF binaries in one file) support */ +#define FATELF_MAGIC (0x1F0E70FA) + +typedef struct fatelf_record { + __le16 machine; /* maps to e_machine */ + __u8 osabi; /* maps to e_ident[EI_OSABI] */ + __u8 osabi_version; /* maps to e_ident[EI_ABIVERSION] */ + __u8 word_size; /* maps to e_ident[EI_CLASS] */ + __u8 byte_order; /* maps to e_ident[EI_DATA] */ + __u8 reserved0; + __u8 reserved1; + __le64 offset; + __le64 size; +} fatelf_record; + +typedef struct fatelf_hdr { + __le32 magic; + __le16 version; + __u8 num_records; + __u8 reserved0; + fatelf_record records[]; +} fatelf_hdr; + + #define EI_NIDENT 16 typedef struct elf32_hdr{ diff -ru linux-2.6.28-orig/kernel/module.c linux-2.6.28/kernel/module.c --- linux-2.6.28-orig/kernel/module.c 2009-10-03 02:11:23.000000000 -0400 +++ linux-2.6.28/kernel/module.c 2009-10-03 02:08:53.000000000 -0400 @@ -1833,13 +1833,69 @@ return ret; } +/* + * See if we're a valid FatELF binary, find the right record, and + * return the offset of that record within the binary. Returns NULL if there's + * a problem, or a pointer to the real ELF header if we're okay. + * If we don't see the FatELF magic number, we assume this is a regular ELF + * binary and let the regular ELF checks handle it. + * + * This is a simplified version of examine_fatelf in fs/binfmt_elf.c + */ +static Elf_Ehdr *examine_fatelf_module(const unsigned char *hdr, + const unsigned long len) +{ + Elf_Ehdr elf; + int records, i; + const fatelf_hdr *fatelf = (const fatelf_hdr *) hdr; + + if (likely(le32_to_cpu(fatelf->magic) != FATELF_MAGIC)) { + return (Elf_Ehdr *) hdr; /* not FatELF; not an error. */ + } else if (unlikely(le16_to_cpu(fatelf->version) != 1)) { + return NULL; /* Unrecognized format version. */ + } + + memset(&elf, 0, sizeof (elf)); + + records = (int) fatelf->num_records; /* uint8, no byteswap needed */ + for (i = 0; i < records; i++) { + const fatelf_record *record = &fatelf->records[i]; + + /* Fill in the data elf_check_arch() might care about. */ + elf.e_ident[EI_OSABI] = record->osabi; + elf.e_ident[EI_CLASS] = record->word_size; + elf.e_ident[EI_DATA] = record->byte_order; + elf.e_machine = le16_to_cpu(record->machine); + + if (likely(!elf_check_arch(&elf))) { + continue; /* Unsupported CPU architecture. */ + } else { + const __u64 rec_offset = le64_to_cpu(record->offset); + const __u64 rec_size = le64_to_cpu(record->size); + const __u64 end_offset = rec_offset + rec_size; + const unsigned long uloff = (unsigned long) rec_offset; + + if (unlikely(end_offset < rec_offset)) { + continue; /* overflow (corrupt file?)... */ + } else if (unlikely(end_offset > len)) { + continue; /* past EOF. */ + } + + return (Elf_Ehdr *) (hdr + uloff); + } + } + + return NULL; /* no binaries we could use. */ +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static noinline struct module *load_module(void __user *umod, unsigned long len, const char __user *uargs) { - Elf_Ehdr *hdr; + Elf_Ehdr *hdr_alloc; /* returned from vmalloc */ + Elf_Ehdr *hdr; /* adjusted hdr_alloc for FatELF */ Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; char *staging; @@ -1863,13 +1919,19 @@ /* Suck in entire file: we'll want most of it. */ /* vmalloc barfs on "unusual" numbers. Check here */ - if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) + if (len > 64 * 1024 * 1024 || (hdr_alloc = vmalloc(len)) == NULL) return ERR_PTR(-ENOMEM); - if (copy_from_user(hdr, umod, len) != 0) { + if (copy_from_user(hdr_alloc, umod, len) != 0) { err = -EFAULT; goto free_hdr; } + hdr = examine_fatelf_module((unsigned char *) hdr_alloc, len); + if (hdr == NULL) { + err = -ENOEXEC; + goto free_hdr; + } + /* Sanity checks against insmoding binaries or wrong arch, weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 @@ -2255,7 +2317,7 @@ sechdrs[unwindex].sh_size); /* Get rid of temporary copy */ - vfree(hdr); + vfree(hdr_alloc); /* Done! */ return mod; @@ -2278,7 +2340,7 @@ free_mod: kfree(args); free_hdr: - vfree(hdr); + vfree(hdr_alloc); return ERR_PTR(err); truncated: --- linux-2.6.28-orig/fs/Makefile 2008-12-24 18:26:37.000000000 -0500 +++ linux-2.6.28/fs/Makefile 2009-10-14 23:32:11.000000000 -0400 @@ -41,8 +41,9 @@ # binfmt_script is always there obj-y += binfmt_script.o -obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o +# List compat_* first, so they insert at end of the list, and are tried last. obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o +obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o