5a622f2d0f
Creating PDEs with refcount 0 and "deleted" flag has problems (see below). Switch to usual scheme: * PDE is created with refcount 1 * every de_get does +1 * every de_put() and remove_proc_entry() do -1 * once refcount reaches 0, PDE is freed. This elegantly fixes at least two following races (both observed) without introducing new locks, without abusing old locks, without spreading lock_kernel(): 1) PDE leak remove_proc_entry de_put ----------------- ------ [refcnt = 1] if (atomic_read(&de->count) == 0) if (atomic_dec_and_test(&de->count)) if (de->deleted) /* also not taken! */ free_proc_entry(de); else de->deleted = 1; [refcount=0, deleted=1] 2) use after free remove_proc_entry de_put ----------------- ------ [refcnt = 1] if (atomic_dec_and_test(&de->count)) if (atomic_read(&de->count) == 0) free_proc_entry(de); /* boom! */ if (de->deleted) free_proc_entry(de); BUG: unable to handle kernel paging request at virtual address 6b6b6b6b printing eip: c10acdda *pdpt = 00000000338f8001 *pde = 0000000000000000 Oops: 0000 [#1] PREEMPT SMP Modules linked in: af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom Pid: 23161, comm: cat Not tainted (2.6.24-rc2-8c0863403f109a43d7000b4646da4818220d501f #4) EIP: 0060:[<c10acdda>] EFLAGS: 00210097 CPU: 1 EIP is at strnlen+0x6/0x18 EAX: 6b6b6b6b EBX: 6b6b6b6b ECX: 6b6b6b6b EDX: fffffffe ESI: c128fa3b EDI: f380bf34 EBP: ffffffff ESP: f380be44 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 Process cat (pid: 23161, ti=f380b000 task=f38f2570 task.ti=f380b000) Stack: c10ac4f0 00000278 c12ce000 f43cd2a8 00000163 00000000 7da86067 00000400 c128fa20 00896b18 f38325a8 c128fe20 ffffffff 00000000 c11f291e 00000400 f75be300 c128fa20 f769c9a0 c10ac779 f380bf34 f7bfee70 c1018e6b f380bf34 Call Trace: [<c10ac4f0>] vsnprintf+0x2ad/0x49b [<c10ac779>] vscnprintf+0x14/0x1f [<c1018e6b>] vprintk+0xc5/0x2f9 [<c10379f1>] handle_fasteoi_irq+0x0/0xab [<c1004f44>] do_IRQ+0x9f/0xb7 [<c117db3b>] preempt_schedule_irq+0x3f/0x5b 
[<c100264e>] need_resched+0x1f/0x21 [<c10190ba>] printk+0x1b/0x1f [<c107c8ad>] de_put+0x3d/0x50 [<c107c8f8>] proc_delete_inode+0x38/0x41 [<c107c8c0>] proc_delete_inode+0x0/0x41 [<c1066298>] generic_delete_inode+0x5e/0xc6 [<c1065aa9>] iput+0x60/0x62 [<c1063c8e>] d_kill+0x2d/0x46 [<c1063fa9>] dput+0xdc/0xe4 [<c10571a1>] __fput+0xb0/0xcd [<c1054e49>] filp_close+0x48/0x4f [<c1055ee9>] sys_close+0x67/0xa5 [<c10026b6>] sysenter_past_esp+0x5f/0x85 ======================= Code: c9 74 0c f2 ae 74 05 bf 01 00 00 00 4f 89 fa 5f 89 d0 c3 85 c9 57 89 c7 89 d0 74 05 f2 ae 75 01 4f 89 f8 5f c3 89 c1 89 c8 eb 06 <80> 38 00 74 07 40 4a 83 fa ff 75 f4 29 c8 c3 90 90 90 57 83 c9 EIP: [<c10acdda>] strnlen+0x6/0x18 SS:ESP 0068:f380be44 Also, remove broken usage of ->deleted from reiserfs: if sget() succeeds, module is already pinned and remove_proc_entry() can't happen => nobody can mark PDE deleted. Dummy proc root in netns code is not marked with refcount 1. AFAICS, we never get it, it's just for proper /proc/net removal. I double checked CLONE_NETNS continues to work. Patch survives many hours of modprobe/rmmod/cat loops without new bugs which can be attributed to refcounting. Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
298 lines
8.6 KiB
C
298 lines
8.6 KiB
C
#ifndef _LINUX_PROC_FS_H
|
|
#define _LINUX_PROC_FS_H
|
|
|
|
#include <linux/slab.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/magic.h>
|
|
#include <asm/atomic.h>
|
|
|
|
struct net;
|
|
struct completion;
|
|
|
|
/*
 * The proc filesystem constants/structures
 */
|
|
|
|
/*
 * Offset of the first process in the /proc root directory.
 */
#define FIRST_PROCESS_ENTRY 256

/*
 * We always define these enumerators
 */
enum {
	PROC_ROOT_INO = 1,	/* presumably the inode number of the /proc root -- confirm */
};
|
|
|
|
/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * The "next" pointer creates a linked list of one /proc directory,
 * while parent/subdir create the directory structure (every
 * /proc file has a parent, but "subdir" is NULL for all
 * non-directory entries).
 *
 * "get_info" is called at "read", while "owner" is used to protect the
 * module from unloading while the proc_dir_entry is in use.
 */
|
|
|
|
typedef int (read_proc_t)(char *page, char **start, off_t off,
|
|
int count, int *eof, void *data);
|
|
typedef int (write_proc_t)(struct file *file, const char __user *buffer,
|
|
unsigned long count, void *data);
|
|
typedef int (get_info_t)(char *, char **, off_t, int);
|
|
typedef struct proc_dir_entry *(shadow_proc_t)(struct task_struct *task,
|
|
struct proc_dir_entry *pde);
|
|
|
|
struct proc_dir_entry {
|
|
unsigned int low_ino;
|
|
unsigned short namelen;
|
|
const char *name;
|
|
mode_t mode;
|
|
nlink_t nlink;
|
|
uid_t uid;
|
|
gid_t gid;
|
|
loff_t size;
|
|
const struct inode_operations *proc_iops;
|
|
/*
|
|
* NULL ->proc_fops means "PDE is going away RSN" or
|
|
* "PDE is just created". In either case, e.g. ->read_proc won't be
|
|
* called because it's too late or too early, respectively.
|
|
*
|
|
* If you're allocating ->proc_fops dynamically, save a pointer
|
|
* somewhere.
|
|
*/
|
|
const struct file_operations *proc_fops;
|
|
get_info_t *get_info;
|
|
struct module *owner;
|
|
struct proc_dir_entry *next, *parent, *subdir;
|
|
void *data;
|
|
read_proc_t *read_proc;
|
|
write_proc_t *write_proc;
|
|
atomic_t count; /* use count */
|
|
int pde_users; /* number of callers into module in progress */
|
|
spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
|
|
struct completion *pde_unload_completion;
|
|
shadow_proc_t *shadow_proc;
|
|
};
|
|
|
|
/*
 * Node of a singly linked list (chained through ->next) that is handed
 * to kclist_add().
 */
struct kcore_list {
	struct kcore_list *next;
	unsigned long addr;	/* presumably the start address of the region -- confirm */
	size_t size;		/* presumably the region length in bytes -- confirm */
};
|
|
|
|
struct vmcore {
|
|
struct list_head list;
|
|
unsigned long long paddr;
|
|
unsigned long long size;
|
|
loff_t offset;
|
|
};
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
extern struct proc_dir_entry proc_root;
|
|
extern struct proc_dir_entry *proc_root_fs;
|
|
extern struct proc_dir_entry *proc_bus;
|
|
extern struct proc_dir_entry *proc_root_driver;
|
|
extern struct proc_dir_entry *proc_root_kcore;
|
|
|
|
extern spinlock_t proc_subdir_lock;
|
|
|
|
extern void proc_root_init(void);
|
|
extern void proc_misc_init(void);
|
|
|
|
struct mm_struct;
|
|
|
|
void proc_flush_task(struct task_struct *task);
|
|
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
|
|
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
|
|
unsigned long task_vsize(struct mm_struct *);
|
|
int task_statm(struct mm_struct *, int *, int *, int *, int *);
|
|
char *task_mem(struct mm_struct *, char *);
|
|
void clear_refs_smap(struct mm_struct *mm);
|
|
|
|
struct proc_dir_entry *de_get(struct proc_dir_entry *de);
|
|
void de_put(struct proc_dir_entry *de);
|
|
|
|
extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
|
|
struct proc_dir_entry *parent);
|
|
extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);
|
|
|
|
extern struct vfsmount *proc_mnt;
struct pid_namespace;
/* Superblock and inode helpers; implementations live outside this header. */
extern int proc_fill_super(struct super_block *);
extern struct inode *proc_get_inode(struct super_block *, unsigned int,
				    struct proc_dir_entry *);
|
|
|
|
/*
|
|
* These are generic /proc routines that use the internal
|
|
* "struct proc_dir_entry" tree to traverse the filesystem.
|
|
*
|
|
* The /proc root directory has extended versions to take care
|
|
* of the /proc/<pid> subdirectories.
|
|
*/
|
|
extern int proc_readdir(struct file *, void *, filldir_t);
|
|
extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
|
|
|
|
/* File operation tables defined elsewhere. */
extern const struct file_operations proc_kcore_operations;
extern const struct file_operations proc_kmsg_operations;
extern const struct file_operations ppc_htab_operations;

/*
 * Per-pid-namespace proc setup/teardown. Only the prepare step returns
 * a status (int).
 */
extern int pid_ns_prepare_proc(struct pid_namespace *ns);
extern void pid_ns_release_proc(struct pid_namespace *ns);
|
|
|
|
/*
 * proc_tty.c
 */
struct tty_driver;
extern void proc_tty_init(void);
extern void proc_tty_register_driver(struct tty_driver *driver);
extern void proc_tty_unregister_driver(struct tty_driver *driver);
|
|
|
|
/*
 * proc_devtree.c
 *
 * Declared only when CONFIG_PROC_DEVICETREE is set.
 */
#ifdef CONFIG_PROC_DEVICETREE
struct device_node;
struct property;
extern void proc_device_tree_init(void);
extern void proc_device_tree_add_node(struct device_node *,
				      struct proc_dir_entry *);
extern void proc_device_tree_add_prop(struct proc_dir_entry *pde,
				      struct property *prop);
extern void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
					 struct property *prop);
extern void proc_device_tree_update_prop(struct proc_dir_entry *pde,
					 struct property *newprop,
					 struct property *oldprop);
#endif /* CONFIG_PROC_DEVICETREE */
|
|
|
|
extern struct proc_dir_entry *proc_symlink(const char *,
|
|
struct proc_dir_entry *, const char *);
|
|
extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *);
|
|
extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
|
|
struct proc_dir_entry *parent);
|
|
|
|
static inline struct proc_dir_entry *create_proc_read_entry(const char *name,
|
|
mode_t mode, struct proc_dir_entry *base,
|
|
read_proc_t *read_proc, void * data)
|
|
{
|
|
struct proc_dir_entry *res=create_proc_entry(name,mode,base);
|
|
if (res) {
|
|
res->read_proc=read_proc;
|
|
res->data=data;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
static inline struct proc_dir_entry *create_proc_info_entry(const char *name,
|
|
mode_t mode, struct proc_dir_entry *base, get_info_t *get_info)
|
|
{
|
|
struct proc_dir_entry *res=create_proc_entry(name,mode,base);
|
|
if (res) res->get_info=get_info;
|
|
return res;
|
|
}
|
|
|
|
/*
 * Per-network-namespace variants: create a proc file with the given
 * file_operations, or remove one by name. Implementations live outside
 * this header.
 */
extern struct proc_dir_entry *proc_net_fops_create(struct net *net,
	const char *name, mode_t mode, const struct file_operations *fops);
extern void proc_net_remove(struct net *net, const char *name);
|
|
|
|
#else
|
|
|
|
/* Without CONFIG_PROC_FS the well-known entries collapse to NULL. */
#define proc_root_driver NULL
#define proc_bus NULL

/*
 * Stub: evaluates only @mode (cast to void) and yields NULL.
 * Uses a GNU statement expression.
 */
#define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })

/* Stub: nothing to remove when /proc is not built. */
static inline void proc_net_remove(struct net *net, const char *name) {}
|
|
|
|
/* Stub used when CONFIG_PROC_FS is disabled: does nothing. */
static inline void proc_flush_task(struct task_struct *task)
{
}
|
|
|
|
/* Stub used when CONFIG_PROC_FS is disabled: never creates anything. */
static inline struct proc_dir_entry *create_proc_entry(const char *name,
	mode_t mode, struct proc_dir_entry *parent)
{
	return NULL;
}

/* Stub: expands to an empty statement; the arguments are not evaluated. */
#define remove_proc_entry(name, parent) do {} while (0)
|
|
|
|
/* Stub used when CONFIG_PROC_FS is disabled: always returns NULL. */
static inline struct proc_dir_entry *proc_symlink(const char *name,
		struct proc_dir_entry *parent, const char *dest)
{
	return NULL;
}

/* Stub used when CONFIG_PROC_FS is disabled: always returns NULL. */
static inline struct proc_dir_entry *proc_mkdir(const char *name,
	struct proc_dir_entry *parent)
{
	return NULL;
}
|
|
|
|
static inline struct proc_dir_entry *create_proc_read_entry(const char *name,
|
|
mode_t mode, struct proc_dir_entry *base,
|
|
read_proc_t *read_proc, void * data) { return NULL; }
|
|
static inline struct proc_dir_entry *create_proc_info_entry(const char *name,
|
|
mode_t mode, struct proc_dir_entry *base, get_info_t *get_info)
|
|
{ return NULL; }
|
|
|
|
struct tty_driver;

/*
 * Stubs used when CONFIG_PROC_FS is disabled: both do nothing.
 * No ';' follows the bodies, since a stray one would be an empty
 * file-scope declaration, which ISO C does not allow.
 */
static inline void proc_tty_register_driver(struct tty_driver *driver) {}
static inline void proc_tty_unregister_driver(struct tty_driver *driver) {}
|
|
|
|
/* proc_root is still declared when CONFIG_PROC_FS is disabled. */
extern struct proc_dir_entry proc_root;
|
|
|
|
/*
 * Forward declaration so the stubs below do not depend on another header
 * having declared the type first.
 */
struct pid_namespace;

/* Stub used when CONFIG_PROC_FS is disabled: nothing to set up, returns 0. */
static inline int pid_ns_prepare_proc(struct pid_namespace *ns)
{
	return 0;
}

/* Stub used when CONFIG_PROC_FS is disabled: nothing to release. */
static inline void pid_ns_release_proc(struct pid_namespace *ns)
{
}
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
/*
 * kclist_add(): a real function when CONFIG_PROC_KCORE is set, otherwise
 * an empty inline stub that ignores its arguments.
 */
#if !defined(CONFIG_PROC_KCORE)
static inline void kclist_add(struct kcore_list *new, void *addr, size_t size)
{
}
#else
extern void kclist_add(struct kcore_list *, void *, size_t);
#endif
|
|
|
|
/*
 * Per-inode operation slot embedded in struct proc_inode. It holds one of
 * two callback types, and only one member is valid at a time.
 */
union proc_op {
	int (*proc_get_link)(struct inode *, struct dentry **,
			     struct vfsmount **);
	int (*proc_read)(struct task_struct *task, char *page);
};
|
|
|
|
struct proc_inode {
|
|
struct pid *pid;
|
|
int fd;
|
|
union proc_op op;
|
|
struct proc_dir_entry *pde;
|
|
struct inode vfs_inode;
|
|
};
|
|
|
|
static inline struct proc_inode *PROC_I(const struct inode *inode)
|
|
{
|
|
return container_of(inode, struct proc_inode, vfs_inode);
|
|
}
|
|
|
|
static inline struct proc_dir_entry *PDE(const struct inode *inode)
|
|
{
|
|
return PROC_I(inode)->pde;
|
|
}
|
|
|
|
static inline struct net *PDE_NET(struct proc_dir_entry *pde)
|
|
{
|
|
return pde->parent->data;
|
|
}
|
|
|
|
/*
 * Returns a struct net * for @inode. The implementation lives outside
 * this header, so reference and ownership rules are not visible here.
 */
struct net *get_proc_net(const struct inode *inode);
|
|
|
|
/*
 * Private state holding pid/task pointers; ->tail_vma exists only when
 * CONFIG_MMU is set.
 * NOTE(review): presumably used by the /proc/<pid>/maps code -- confirm.
 */
struct proc_maps_private {
	struct pid *pid;
	struct task_struct *task;
#ifdef CONFIG_MMU
	struct vm_area_struct *tail_vma;
#endif
};
|
|
|
|
#endif /* _LINUX_PROC_FS_H */
|