From: Andrea Arcangeli Subject: seccomp for 2.6.11-rc3 Add seccomp mode to safely compute untrusted code. Signed-off-by: Andrea Arcangeli --- 2.6.11-rc3/arch/i386/Kconfig 2005-02-11 03:51:27.962342554 +0100 +++ seccomp/arch/i386/Kconfig 2005-02-11 03:56:40.050875895 +0100 @@ -888,6 +888,23 @@ config REGPARM generate incorrect output with certain kernel constructs when -mregparm=3 is used. +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. 
+ endmenu --- 2.6.11-rc3/arch/i386/kernel/entry.S 2005-02-11 03:51:28.158278238 +0100 +++ seccomp/arch/i386/kernel/entry.S 2005-02-11 03:56:40.049876223 +0100 @@ -219,7 +219,8 @@ sysenter_past_esp: SAVE_ALL GET_THREAD_INFO(%ebp) - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -243,7 +244,8 @@ ENTRY(system_call) SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys --- 2.6.11-rc3/arch/i386/kernel/ptrace.c 2005-02-11 03:51:28.186269050 +0100 +++ seccomp/arch/i386/kernel/ptrace.c 2005-02-11 03:56:40.049876223 +0100 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -678,6 +679,9 @@ void send_sigtrap(struct task_struct *ts __attribute__((regparm(3))) void do_syscall_trace(struct pt_regs *regs, int entryexit) { + /* do the secure computing check first */ + secure_computing(regs->orig_eax); + if (unlikely(current->audit_context)) { if (!entryexit) audit_syscall_entry(current, regs->orig_eax, --- 2.6.11-rc3/arch/x86_64/ia32/ia32entry.S 2005-02-11 03:51:31.864061813 +0100 +++ seccomp/arch/x86_64/ia32/ia32entry.S 2005-02-11 03:56:40.051875566 +0100 @@ -78,7 +78,7 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz sysenter_tracesys sysenter_do_call: cmpl $(IA32_NR_syscalls),%eax @@ -163,7 +163,7 @@ ENTRY(ia32_cstar_target) 
.quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz cstar_tracesys cstar_do_call: cmpl $IA32_NR_syscalls,%eax @@ -236,7 +236,7 @@ ENTRY(ia32_syscall) this could be a problem. */ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls),%eax --- 2.6.11-rc3/arch/x86_64/Kconfig 2005-02-11 03:51:31.817077236 +0100 +++ seccomp/arch/x86_64/Kconfig 2005-02-11 03:56:40.052875238 +0100 @@ -350,6 +350,24 @@ config X86_MCE_INTEL help Additional support for intel specific MCE features such as the thermal monitor. + +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. 
+ endmenu # --- 2.6.11-rc3/arch/x86_64/kernel/entry.S 2005-02-11 03:51:31.914045406 +0100 +++ seccomp/arch/x86_64/kernel/entry.S 2005-02-11 03:56:40.052875238 +0100 @@ -184,7 +184,7 @@ ENTRY(system_call) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys --- 2.6.11-rc3/arch/x86_64/kernel/ptrace.c 2005-02-11 03:51:31.962029655 +0100 +++ seccomp/arch/x86_64/kernel/ptrace.c 2005-02-11 03:56:40.052875238 +0100 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -521,6 +522,9 @@ static void syscall_trace(struct pt_regs asmlinkage void syscall_trace_enter(struct pt_regs *regs) { + /* do the secure computing check first */ + secure_computing(regs->orig_rax); + if (unlikely(current->audit_context)) audit_syscall_entry(current, regs->orig_rax, regs->rdi, regs->rsi, --- 2.6.11-rc3/fs/proc/base.c 2005-02-11 03:51:42.431592978 +0100 +++ seccomp/fs/proc/base.c 2005-02-11 03:56:40.054874581 +0100 @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" /* @@ -49,6 +50,9 @@ enum pid_directory_inos { PROC_TGID_TASK, PROC_TGID_STATUS, PROC_TGID_MEM, +#ifdef CONFIG_SECCOMP + PROC_TGID_SECCOMP, +#endif PROC_TGID_CWD, PROC_TGID_ROOT, PROC_TGID_EXE, @@ -80,6 +84,9 @@ enum pid_directory_inos { PROC_TID_INO, PROC_TID_STATUS, PROC_TID_MEM, +#ifdef CONFIG_SECCOMP + PROC_TID_SECCOMP, +#endif PROC_TID_CWD, PROC_TID_ROOT, PROC_TID_EXE, @@ -130,6 +137,9 @@ static struct pid_entry tgid_base_stuff[ E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), +#ifdef CONFIG_SECCOMP + E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), +#endif E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), 
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -160,6 +170,9 @@ static struct pid_entry tid_base_stuff[] E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), +#ifdef CONFIG_SECCOMP + E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), +#endif E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -808,6 +821,61 @@ static struct file_operations proc_login }; #endif +#ifdef CONFIG_SECCOMP +static ssize_t seccomp_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct * tsk = proc_task(file->f_dentry->d_inode); + char __buf[20]; + loff_t __ppos = *ppos; + size_t len; + + /* no need to print the trailing zero, so use only len */ + len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + if (__ppos >= len) + return 0; + if (count > len-__ppos) + count = len-__ppos; + if (copy_to_user(buf, __buf + __ppos, count)) + return -EFAULT; + *ppos = __ppos + count; + return count; +} + +static ssize_t seccomp_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct * tsk = proc_task(file->f_dentry->d_inode); + char __buf[20], * end; + unsigned int seccomp_mode; + + /* can set it only once to be even more secure */ + if (unlikely(tsk->seccomp.mode)) + return -EPERM; + + memset(__buf, 0, 20); + if (count > 19) + count = 19; + if (copy_from_user(__buf, buf, count)) + return -EFAULT; + seccomp_mode = simple_strtoul(__buf, &end, 0); + if (*end == '\n') + end++; + if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { + tsk->seccomp.mode = seccomp_mode; + set_tsk_thread_flag(tsk, TIF_SECCOMP); + } + if (unlikely(!(end - __buf))) + return -EIO; + return end - __buf; +} + +static struct file_operations proc_seccomp_operations = { + .read = seccomp_read, + .write = seccomp_write, +}; +#endif /* CONFIG_SECCOMP */ + static int proc_pid_follow_link(struct 
dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1443,6 +1511,12 @@ static struct dentry *proc_pident_lookup inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; +#ifdef CONFIG_SECCOMP + case PROC_TID_SECCOMP: + case PROC_TGID_SECCOMP: + inode->i_fop = &proc_seccomp_operations; + break; +#endif /* CONFIG_SECCOMP */ case PROC_TID_MOUNTS: case PROC_TGID_MOUNTS: inode->i_fop = &proc_mounts_operations; --- 2.6.11-rc3/include/asm-i386/thread_info.h 2005-02-11 03:51:42.979413152 +0100 +++ seccomp/include/asm-i386/thread_info.h 2005-02-11 03:56:40.054874581 +0100 @@ -140,6 +140,7 @@ register unsigned long current_stack_poi #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ +#define TIF_SECCOMP 8 /* secure computing */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 @@ -150,12 +151,14 @@ register unsigned long current_stack_poi #define _TIF_SINGLESTEP (1< #include #include +#include struct exec_domain; @@ -643,6 +644,7 @@ struct task_struct { void *security; struct audit_context *audit_context; + seccomp_t seccomp; /* Thread group tracking */ u32 parent_exec_id; --- 2.6.11-rc3/include/linux/seccomp.h 1970-01-01 01:00:00.000000000 +0100 +++ seccomp/include/linux/seccomp.h 2005-02-11 03:56:40.059872940 +0100 @@ -0,0 +1,33 @@ +#ifndef _LINUX_SECCOMP_H +#define _LINUX_SECCOMP_H + +#include + +#ifdef CONFIG_SECCOMP + +#define NR_SECCOMP_MODES 1 + +#include + +typedef struct { int mode; } seccomp_t; + +extern void __secure_computing(int); +static inline void secure_computing(int this_syscall) +{ + if (unlikely(test_thread_flag(TIF_SECCOMP))) + __secure_computing(this_syscall); +} + +#else /* CONFIG_SECCOMP */ + +#if (__GNUC__ > 2) + typedef struct { } seccomp_t; +#else + typedef struct { int gcc_is_buggy; } seccomp_t; +#endif + +#define 
secure_computing(x) do { } while (0) + +#endif /* CONFIG_SECCOMP */ + +#endif /* _LINUX_SECCOMP_H */ --- 2.6.11-rc3/kernel/Makefile 2005-01-04 01:13:30.000000000 +0100 +++ seccomp/kernel/Makefile 2005-02-11 03:56:40.059872940 +0100 @@ -26,6 +26,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +obj-$(CONFIG_SECCOMP) += seccomp.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is --- 2.6.11-rc3/kernel/seccomp.c 1970-01-01 01:00:00.000000000 +0100 +++ seccomp/kernel/seccomp.c 2005-02-11 03:56:40.060872612 +0100 @@ -0,0 +1,74 @@ +/* + * linux/kernel/seccomp.c + * + * Copyright 2004-2005 Andrea Arcangeli + * + * This defines a simple but solid secure-computing mode. + */ + +#include +#include +#include +#ifdef TIF_IA32 +#include +#endif + +/* #define SECCOMP_DEBUG 1 */ + +/* + * Secure computing mode 1 allows only read/write/exit/sigreturn. + * To be fully secure this must be combined with rlimit + * to limit the stack allocations too. + */ +static int mode1_syscalls[] = { + __NR_read, __NR_write, __NR_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. + */ +#ifdef __NR_sigreturn + __NR_sigreturn, +#else + __NR_rt_sigreturn, +#endif + 0, /* null terminated */ +}; + +#ifdef TIF_IA32 +static int mode1_syscalls_32bit[] = { + __NR_ia32_read, __NR_ia32_write, __NR_ia32_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. 
+ */ + __NR_ia32_sigreturn, + 0, /* null terminated */ +}; +#endif + +void __secure_computing(int this_syscall) +{ + int mode = current->seccomp.mode; + int * syscall; + + switch (mode) { + case 1: + syscall = mode1_syscalls; +#ifdef TIF_IA32 + if (test_thread_flag(TIF_IA32)) + syscall = mode1_syscalls_32bit; +#endif + do { + if (*syscall == this_syscall) + return; + } while (*++syscall); + break; + default: + BUG(); + } + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + do_exit(SIGKILL); +}