Merge branch 'timers-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'timers-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
clockevents: remove WARN_ON which was used to gather information
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 680fb56..8362860 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -144,8 +144,8 @@
void (*kill_sb) (struct super_block *);
locking rules:
may block BKL
-get_sb yes yes
-kill_sb yes yes
+get_sb yes no
+kill_sb yes no
->get_sb() returns error or 0 with locked superblock attached to the vfsmount
(exclusive on ->s_umount).
@@ -409,12 +409,12 @@
unlocked_ioctl: no (see below)
compat_ioctl: no
mmap: no
-open: maybe (see below)
+open: no
flush: no
release: no
fsync: no (see below)
aio_fsync: no
-fasync: yes (see below)
+fasync: no
lock: yes
readv: no
writev: no
@@ -431,13 +431,6 @@
semaphore. Note some filesystems (i.e. remote ones) provide no
protection for i_size so you will need to use the BKL.
-->open() locking is in-transit: big lock partially moved into the methods.
-The only exception is ->open() in the instances of file_operations that never
-end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices
-(chrdev_open() takes lock before replacing ->f_op and calling the secondary
-method. As soon as we fix the handling of module reference counters all
-instances of ->open() will be called without the BKL.
-
Note: ext2_release() was *the* source of contention on fs-intensive
loads and dropping BKL on ->release() helps to get rid of that (we still
grab BKL for cases when we close a file that had been opened r/w, but that
diff --git a/MAINTAINERS b/MAINTAINERS
index b3e92fb..186be3b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -750,11 +750,13 @@
M: syrjala@sci.fi
S: Maintained
-ATL1 ETHERNET DRIVER
+ATLX ETHERNET DRIVERS
P: Jay Cliburn
M: jcliburn@gmail.com
P: Chris Snook
M: csnook@redhat.com
+P: Jie Yang
+M: jie.yang@atheros.com
L: atl1-devel@lists.sourceforge.net
W: http://sourceforge.net/projects/atl1
W: http://atl1.sourceforge.net
diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h
index 7286e4a..a7acad2 100644
--- a/arch/ia64/include/asm/sections.h
+++ b/arch/ia64/include/asm/sections.h
@@ -21,5 +21,8 @@
extern char __start_unwind[], __end_unwind[];
extern char __start_ivt_text[], __end_ivt_text[];
+#undef dereference_function_descriptor
+void *dereference_function_descriptor(void *);
+
#endif /* _ASM_IA64_SECTIONS_H */
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index 29aad34..545626f 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -31,9 +31,11 @@
#include <linux/elf.h>
#include <linux/moduleloader.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <asm/patch.h>
+#include <asm/sections.h>
#include <asm/unaligned.h>
#define ARCH_MODULE_DEBUG 0
@@ -941,3 +943,13 @@
if (mod->arch.core_unw_table)
unw_remove_unwind_table(mod->arch.core_unw_table);
}
+
+void *dereference_function_descriptor(void *ptr)
+{
+ struct fdesc *desc = ptr;
+ void *p;
+
+ if (!probe_kernel_address(&desc->ip, p))
+ ptr = p;
+ return ptr;
+}
diff --git a/arch/mips/sgi-ip22/ip22-platform.c b/arch/mips/sgi-ip22/ip22-platform.c
index 6014123..52486c4d 100644
--- a/arch/mips/sgi-ip22/ip22-platform.c
+++ b/arch/mips/sgi-ip22/ip22-platform.c
@@ -150,7 +150,7 @@
return res;
/* Second HPC is missing? */
- if (!ip22_is_fullhouse() ||
+ if (ip22_is_fullhouse() ||
get_dbe(tmp, (unsigned int *)&hpc3c1->pbdma[1]))
return 0;
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index fdacdd4..44138c3 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -47,7 +47,9 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/bug.h>
+#include <linux/uaccess.h>
+#include <asm/sections.h>
#include <asm/unwind.h>
#if 0
@@ -860,3 +862,15 @@
deregister_unwind_table(mod);
module_bug_cleanup(mod);
}
+
+#ifdef CONFIG_64BIT
+void *dereference_function_descriptor(void *ptr)
+{
+ Elf64_Fdesc *desc = ptr;
+ void *p;
+
+ if (!probe_kernel_address(&desc->addr, p))
+ ptr = p;
+ return ptr;
+}
+#endif
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 916018e..7710e9e 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -16,6 +16,9 @@
return 0;
}
+#undef dereference_function_descriptor
+void *dereference_function_descriptor(void *);
+
#endif
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index ee6a298..ad79de2 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -21,8 +21,9 @@
#include <linux/err.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
+#include <linux/uaccess.h>
#include <asm/module.h>
-#include <asm/uaccess.h>
+#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/code-patching.h>
#include <linux/sort.h>
@@ -451,3 +452,13 @@
return 0;
}
+
+void *dereference_function_descriptor(void *ptr)
+{
+ struct ppc64_opd_entry *desc = ptr;
+ void *p;
+
+ if (!probe_kernel_address(&desc->funcaddr, p))
+ ptr = p;
+ return ptr;
+}
diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h
index cde81fa..a2be3a9 100644
--- a/arch/s390/kernel/compat_ptrace.h
+++ b/arch/s390/kernel/compat_ptrace.h
@@ -42,6 +42,7 @@
u32 gprs[NUM_GPRS];
u32 acrs[NUM_ACRS];
u32 orig_gpr2;
+ /* nb: there's a 4-byte hole here */
s390_fp_regs fp_regs;
/*
* These per registers are in here so that gdb can modify them
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 2815bfe..c8b0828 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -170,6 +170,13 @@
*/
tmp = (addr_t) task_pt_regs(child)->orig_gpr2;
+ } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ /*
+ * prevent reads of padding hole between
+ * orig_gpr2 and fp_regs on s390.
+ */
+ tmp = 0;
+
} else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
/*
* floating point regs. are stored in the thread structure
@@ -270,6 +277,13 @@
*/
task_pt_regs(child)->orig_gpr2 = data;
+ } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ /*
+ * prevent writes of padding hole between
+ * orig_gpr2 and fp_regs on s390.
+ */
+ return 0;
+
} else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
/*
* floating point regs. are stored in the thread structure
@@ -428,6 +442,13 @@
*/
tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4);
+ } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ /*
+ * prevent reads of padding hole between
+ * orig_gpr2 and fp_regs on s390.
+ */
+ tmp = 0;
+
} else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
/*
* floating point regs. are stored in the thread structure
@@ -514,6 +535,13 @@
*/
*(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp;
+ } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ /*
+ * prevent writess of padding hole between
+ * orig_gpr2 and fp_regs on s390.
+ */
+ return 0;
+
} else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
/*
* floating point regs. are stored in the thread structure
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2c518fb..b225219 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -382,14 +382,17 @@
# P6_NOPs are a relatively minor optimization that require a family >=
# 6 processor, except that it is broken on certain VIA chips.
# Furthermore, AMD chips prefer a totally different sequence of NOPs
-# (which work on all CPUs). As a result, disallow these if we're
-# compiling X86_GENERIC but not X86_64 (these NOPs do work on all
-# x86-64 capable chips); the list of processors in the right-hand clause
-# are the cores that benefit from this optimization.
+# (which work on all CPUs). In addition, it looks like Virtual PC
+# does not understand them.
+#
+# As a result, disallow these if we're not compiling for X86_64 (these
+# NOPs do work on all x86-64 capable chips); the list of processors in
+# the right-hand clause are the cores that benefit from this optimization.
#
config X86_P6_NOP
def_bool y
- depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC)
+ depends on X86_64
+ depends on (MCORE2 || MPENTIUM4 || MPSC)
config X86_TSC
def_bool y
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9af8907..66e48aa 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1203,7 +1203,7 @@
if (!p)
return -EINVAL;
- if (!strcmp(p, "exactmap")) {
+ if (!strncmp(p, "exactmap", 8)) {
#ifdef CONFIG_CRASH_DUMP
/*
* If we are doing a crash dump, we still need to know
diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index db00b05..f1216cf 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -423,7 +423,7 @@
ret = sysfs_create_group(&chp->dev.kobj, &chp_attr_group);
if (ret) {
device_unregister(&chp->dev);
- goto out_free;
+ goto out;
}
mutex_lock(&channel_subsystems[chpid.cssid]->mutex);
if (channel_subsystems[chpid.cssid]->cm_enabled) {
@@ -432,14 +432,15 @@
sysfs_remove_group(&chp->dev.kobj, &chp_attr_group);
device_unregister(&chp->dev);
mutex_unlock(&channel_subsystems[chpid.cssid]->mutex);
- goto out_free;
+ goto out;
}
}
channel_subsystems[chpid.cssid]->chps[chpid.id] = chp;
mutex_unlock(&channel_subsystems[chpid.cssid]->mutex);
- return ret;
+ goto out;
out_free:
kfree(chp);
+out:
return ret;
}
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 33bff8f..5954b90 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -208,8 +208,10 @@
case 1: /* status pending */
case 2: /* busy */
return -EBUSY;
- default: /* device/path not operational */
+ case 3: /* device/path not operational */
return cio_start_handle_notoper(sch, lpm);
+ default:
+ return ccode;
}
}
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 51489ef..1261e1a 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -633,6 +633,11 @@
css = to_css(dev);
mutex_destroy(&css->mutex);
+ if (css->pseudo_subchannel) {
+ /* Implies that it has been generated but never registered. */
+ css_subchannel_release(&css->pseudo_subchannel->dev);
+ css->pseudo_subchannel = NULL;
+ }
kfree(css);
}
@@ -785,11 +790,15 @@
}
channel_subsystems[i] = css;
ret = setup_css(i);
- if (ret)
- goto out_free;
+ if (ret) {
+ kfree(channel_subsystems[i]);
+ goto out_unregister;
+ }
ret = device_register(&css->device);
- if (ret)
- goto out_free_all;
+ if (ret) {
+ put_device(&css->device);
+ goto out_unregister;
+ }
if (css_chsc_characteristics.secm) {
ret = device_create_file(&css->device,
&dev_attr_cm_enable);
@@ -802,7 +811,7 @@
}
ret = register_reboot_notifier(&css_reboot_notifier);
if (ret)
- goto out_pseudo;
+ goto out_unregister;
css_init_done = 1;
/* Enable default isc for I/O subchannels. */
@@ -810,18 +819,12 @@
for_each_subchannel(__init_channel_subsystem, NULL);
return 0;
-out_pseudo:
- device_unregister(&channel_subsystems[i]->pseudo_subchannel->dev);
out_file:
- device_remove_file(&channel_subsystems[i]->device,
- &dev_attr_cm_enable);
+ if (css_chsc_characteristics.secm)
+ device_remove_file(&channel_subsystems[i]->device,
+ &dev_attr_cm_enable);
out_device:
device_unregister(&channel_subsystems[i]->device);
-out_free_all:
- kfree(channel_subsystems[i]->pseudo_subchannel->lock);
- kfree(channel_subsystems[i]->pseudo_subchannel);
-out_free:
- kfree(channel_subsystems[i]);
out_unregister:
while (i > 0) {
struct channel_subsystem *css;
@@ -829,6 +832,7 @@
i--;
css = channel_subsystems[i];
device_unregister(&css->pseudo_subchannel->dev);
+ css->pseudo_subchannel = NULL;
if (css_chsc_characteristics.secm)
device_remove_file(&css->device,
&dev_attr_cm_enable);
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index 550508d..84cc9ea 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -658,6 +658,13 @@
{
struct subchannel *sch;
+ /* Allow ccw_device_offline while disconnected. */
+ if (cdev->private->state == DEV_STATE_DISCONNECTED ||
+ cdev->private->state == DEV_STATE_NOT_OPER) {
+ cdev->private->flags.donotify = 0;
+ ccw_device_done(cdev, DEV_STATE_NOT_OPER);
+ return 0;
+ }
if (ccw_device_is_orphan(cdev)) {
ccw_device_done(cdev, DEV_STATE_OFFLINE);
return 0;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index d4427cb..2e15da54 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -60,7 +60,7 @@
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
-#define BALLOON_CLASS_NAME "memory"
+#define BALLOON_CLASS_NAME "xen_memory"
struct balloon_stats {
/* We aim for 'current allocation' == 'target allocation'. */
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 15409815..73db464 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -302,18 +302,6 @@
int subtract_lebs;
long long available;
- /*
- * Force the amount available to the total size reported if the used
- * space is zero.
- */
- if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
- c->budg_data_growth + c->budg_dd_growth == 0) {
- /* Do the same calculation as for c->block_cnt */
- available = c->main_lebs - 2;
- available *= c->leb_size - c->dark_wm;
- return available;
- }
-
available = c->main_bytes - c->lst.total_used;
/*
@@ -714,34 +702,106 @@
}
/**
- * ubifs_budg_get_free_space - return amount of free space.
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space application tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexind nodes as well. This introduces additional
+ * overhead, and UBIFS it has to report sligtly less free space to meet the
+ * above expectetion.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, tripled because we always allow enough
+ * space to write the index thrice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
+ */
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+{
+ int divisor, factor, f;
+
+ /*
+ * Reported space size is @free * X, where X is UBIFS block size
+ * divided by UBIFS block size + all overhead one data block
+ * introduces. The overhead is the node header + indexing overhead.
+ *
+ * Indexing overhead calculations are based on the following formula:
+ * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
+ * of data nodes, f - fanout. Because effective UBIFS fanout is twice
+ * as less than maximum fanout, we assume that each data node
+ * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
+ * Note, the multiplier 3 is because UBIFS reseves thrice as more space
+ * for the index.
+ */
+ f = c->fanout > 3 ? c->fanout >> 1 : 2;
+ factor = UBIFS_BLOCK_SIZE;
+ divisor = UBIFS_MAX_DATA_NODE_SZ;
+ divisor += (c->max_idx_node_sz * 3) / (f - 1);
+ free *= factor;
+ do_div(free, divisor);
+ return free;
+}
+
+/**
+ * ubifs_get_free_space - return amount of free space.
* @c: UBIFS file-system description object
*
- * This function returns amount of free space on the file-system.
+ * This function calculates amount of free space to report to user-space.
+ *
+ * Because UBIFS may introduce substantial overhead (the index, node headers,
+ * alighment, wastage at the end of eraseblocks, etc), it cannot report real
+ * amount of free flash space it has (well, because not all dirty space is
+ * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * it would bread user expectetion about what free space is. Users seem to
+ * accustomed to assume that if the file-system reports N bytes of free space,
+ * they would be able to fit a file of N bytes to the FS. This almost works for
+ * traditional file-systems, because they have way less overhead than UBIFS.
+ * So, to keep users happy, UBIFS tries to take the overhead into account.
*/
-long long ubifs_budg_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space(struct ubifs_info *c)
{
- int min_idx_lebs, rsvd_idx_lebs;
+ int min_idx_lebs, rsvd_idx_lebs, lebs;
long long available, outstanding, free;
- /* Do exactly the same calculations as in 'do_budget_space()' */
spin_lock(&c->space_lock);
min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ outstanding = c->budg_data_growth + c->budg_dd_growth;
+ /*
+ * Force the amount available to the total size reported if the used
+ * space is zero.
+ */
+ if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
+ spin_unlock(&c->space_lock);
+ return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
+ }
+
+ available = ubifs_calc_available(c, min_idx_lebs);
+
+ /*
+ * When reporting free space to user-space, UBIFS guarantees that it is
+ * possible to write a file of free space size. This means that for
+ * empty LEBs we may use more precise calculations than
+ * 'ubifs_calc_available()' is using. Namely, we know that in empty
+ * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
+ * Thus, amend the available space.
+ *
+ * Note, the calculations below are similar to what we have in
+ * 'do_budget_space()', so refer there for comments.
+ */
if (min_idx_lebs > c->lst.idx_lebs)
rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
else
rsvd_idx_lebs = 0;
-
- if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
- - c->lst.taken_empty_lebs) {
- spin_unlock(&c->space_lock);
- return 0;
- }
-
- available = ubifs_calc_available(c, min_idx_lebs);
- outstanding = c->budg_data_growth + c->budg_dd_growth;
- c->min_idx_lebs = min_idx_lebs;
+ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+ c->lst.taken_empty_lebs;
+ lebs -= rsvd_idx_lebs;
+ available += lebs * (c->dark_wm - c->leb_overhead);
spin_unlock(&c->space_lock);
if (available > outstanding)
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c96f1f..2b267c9 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -587,7 +587,6 @@
if (err) {
if (err != -ENOSPC)
return err;
- err = 0;
budgeted = 0;
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 4071d1c..3d698e2 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -793,7 +793,7 @@
int err;
struct ubifs_budget_req req;
loff_t old_size = inode->i_size, new_size = attr->ia_size;
- int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
+ int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
struct ubifs_inode *ui = ubifs_inode(inode);
dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -811,8 +811,15 @@
/* A funny way to budget for truncation node */
req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
err = ubifs_budget_space(c, &req);
- if (err)
- return err;
+ if (err) {
+ /*
+ * Treat truncations to zero as deletion and always allow them,
+ * just like we do for '->unlink()'.
+ */
+ if (new_size || err != -ENOSPC)
+ return err;
+ budgeted = 0;
+ }
err = vmtruncate(inode, new_size);
if (err)
@@ -869,7 +876,12 @@
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
mutex_unlock(&ui->ui_mutex);
out_budg:
- ubifs_release_budget(c, &req);
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ else {
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+ }
return err;
}
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index adee7b5..e045c8b 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@
* dirty index heap, and it falls-back to LPT scanning if the heaps are empty
* or do not have an LEB which satisfies the @min_space criteria.
*
- * Note:
- * o LEBs which have less than dead watermark of dirty space are never picked
- * by this function;
- *
- * Returns zero and the LEB properties of
- * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
- * negative error code in case of other failures. The returned LEB is marked as
- * "taken".
+ * Note, LEBs which have less than dead watermark of free + dirty space are
+ * never picked by this function.
*
* The additional @pick_free argument controls if this function has to return a
* free or freeable LEB if one is present. For example, GC must to set it to %1,
@@ -231,6 +225,10 @@
*
* In addition @pick_free is set to %2 by the recovery process in order to
* recover gc_lnum in which case an index LEB must not be returned.
+ *
+ * This function returns zero and the LEB properties of found dirty LEB in case
+ * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
+ * case of other failures. The returned LEB is marked as "taken".
*/
int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
int min_space, int pick_free)
@@ -245,7 +243,7 @@
int lebs, rsvd_idx_lebs = 0;
spin_lock(&c->space_lock);
- lebs = c->lst.empty_lebs;
+ lebs = c->lst.empty_lebs + c->idx_gc_cnt;
lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
/*
@@ -317,7 +315,7 @@
lp = idx_lp;
if (lp) {
- ubifs_assert(lp->dirty >= c->dead_wm);
+ ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
goto found;
}
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac..13f1019 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -344,6 +344,12 @@
if (err)
goto out;
+ /* Allow for races with TNC */
+ c->gced_lnum = lnum;
+ smp_wmb();
+ c->gc_seq += 1;
+ smp_wmb();
+
if (c->gc_lnum == -1) {
c->gc_lnum = lnum;
err = LEB_RETAINED;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 87dabf9..4c12a92 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -284,38 +284,6 @@
}
/**
- * ubifs_reported_space - calculate reported free space.
- * @c: the UBIFS file-system description object
- * @free: amount of free space
- *
- * This function calculates amount of free space which will be reported to
- * user-space. User-space application tend to expect that if the file-system
- * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
- * are able to write a file of size N. UBIFS attaches node headers to each data
- * node and it has to write indexind nodes as well. This introduces additional
- * overhead, and UBIFS it has to report sligtly less free space to meet the
- * above expectetion.
- *
- * This function assumes free space is made up of uncompressed data nodes and
- * full index nodes (one per data node, doubled because we always allow enough
- * space to write the index twice).
- *
- * Note, the calculation is pessimistic, which means that most of the time
- * UBIFS reports less space than it actually has.
- */
-static inline long long ubifs_reported_space(const struct ubifs_info *c,
- uint64_t free)
-{
- int divisor, factor;
-
- divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3);
- factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
- do_div(free, divisor);
-
- return free * factor;
-}
-
-/**
* ubifs_current_time - round current time to time granularity.
* @inode: inode
*/
@@ -325,4 +293,21 @@
current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
}
+/**
+ * ubifs_tnc_lookup - look up a file-system node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ *
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure.
+ */
+static inline int ubifs_tnc_lookup(struct ubifs_info *c,
+ const union ubifs_key *key, void *node)
+{
+ return ubifs_tnc_locate(c, key, node, NULL, NULL);
+}
+
#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f71e6b8..7562464 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -370,8 +370,9 @@
{
struct ubifs_info *c = dentry->d_sb->s_fs_info;
unsigned long long free;
+ __le32 *uuid = (__le32 *)c->uuid;
- free = ubifs_budg_get_free_space(c);
+ free = ubifs_get_free_space(c);
dbg_gen("free space %lld bytes (%lld blocks)",
free, free >> UBIFS_BLOCK_SHIFT);
@@ -386,7 +387,8 @@
buf->f_files = 0;
buf->f_ffree = 0;
buf->f_namelen = UBIFS_MAX_NLEN;
-
+ buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
+ buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
return 0;
}
@@ -530,6 +532,12 @@
c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
+ /*
+ * Calculate how many bytes would be wasted at the end of LEB if it was
+ * fully filled with data nodes of maximum size. This is used in
+ * calculations when reporting free space.
+ */
+ c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
return 0;
}
@@ -647,13 +655,11 @@
* internally because it does not make much sense for UBIFS, but it is
* necessary to report something for the 'statfs()' call.
*
- * Subtract the LEB reserved for GC and the LEB which is reserved for
- * deletions.
- *
- * Review 'ubifs_calc_available()' if changing this calculation.
+ * Subtract the LEB reserved for GC, the LEB which is reserved for
+ * deletions, and assume only one journal head is available.
*/
- tmp64 = c->main_lebs - 2;
- tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
+ tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
+ tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
tmp64 = ubifs_reported_space(c, tmp64);
c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a..7da209a 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -506,7 +506,7 @@
if (keys_cmp(c, key, &node_key) != 0)
ret = 0;
}
- if (ret == 0)
+ if (ret == 0 && c->replaying)
dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
return ret;
@@ -1382,50 +1382,39 @@
}
/**
- * ubifs_tnc_lookup - look up a file-system node.
+ * maybe_leb_gced - determine if a LEB may have been garbage collected.
* @c: UBIFS file-system description object
- * @key: node key to lookup
- * @node: the node is returned here
+ * @lnum: LEB number
+ * @gc_seq1: garbage collection sequence number
*
- * This function look up and reads node with key @key. The caller has to make
- * sure the @node buffer is large enough to fit the node. Returns zero in case
- * of success, %-ENOENT if the node was not found, and a negative error code in
- * case of failure.
+ * This function determines if @lnum may have been garbage collected since
+ * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
+ * %0 is returned.
*/
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
- void *node)
+static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
{
- int found, n, err;
- struct ubifs_znode *znode;
- struct ubifs_zbranch zbr, *zt;
+ int gc_seq2, gced_lnum;
- mutex_lock(&c->tnc_mutex);
- found = ubifs_lookup_level0(c, key, &znode, &n);
- if (!found) {
- err = -ENOENT;
- goto out;
- } else if (found < 0) {
- err = found;
- goto out;
- }
- zt = &znode->zbranch[n];
- if (is_hash_key(c, key)) {
- /*
- * In this case the leaf node cache gets used, so we pass the
- * address of the zbranch and keep the mutex locked
- */
- err = tnc_read_node_nm(c, zt, node);
- goto out;
- }
- zbr = znode->zbranch[n];
- mutex_unlock(&c->tnc_mutex);
-
- err = ubifs_tnc_read_node(c, &zbr, node);
- return err;
-
-out:
- mutex_unlock(&c->tnc_mutex);
- return err;
+ gced_lnum = c->gced_lnum;
+ smp_rmb();
+ gc_seq2 = c->gc_seq;
+ /* Same seq means no GC */
+ if (gc_seq1 == gc_seq2)
+ return 0;
+ /* Different by more than 1 means we don't know */
+ if (gc_seq1 + 1 != gc_seq2)
+ return 1;
+ /*
+ * We have seen the sequence number has increased by 1. Now we need to
+ * be sure we read the right LEB number, so read it again.
+ */
+ smp_rmb();
+ if (gced_lnum != c->gced_lnum)
+ return 1;
+ /* Finally we can check lnum */
+ if (gced_lnum == lnum)
+ return 1;
+ return 0;
}
/**
@@ -1436,16 +1425,19 @@
* @lnum: LEB number is returned here
* @offs: offset is returned here
*
- * This function is the same as 'ubifs_tnc_lookup()' but it returns the node
- * location also. See 'ubifs_tnc_lookup()'.
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure. The node location can be returned in @lnum and @offs.
*/
int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
void *node, int *lnum, int *offs)
{
- int found, n, err;
+ int found, n, err, safely = 0, gc_seq1;
struct ubifs_znode *znode;
struct ubifs_zbranch zbr, *zt;
+again:
mutex_lock(&c->tnc_mutex);
found = ubifs_lookup_level0(c, key, &znode, &n);
if (!found) {
@@ -1456,24 +1448,43 @@
goto out;
}
zt = &znode->zbranch[n];
+ if (lnum) {
+ *lnum = zt->lnum;
+ *offs = zt->offs;
+ }
if (is_hash_key(c, key)) {
/*
* In this case the leaf node cache gets used, so we pass the
* address of the zbranch and keep the mutex locked
*/
- *lnum = zt->lnum;
- *offs = zt->offs;
err = tnc_read_node_nm(c, zt, node);
goto out;
}
+ if (safely) {
+ err = ubifs_tnc_read_node(c, zt, node);
+ goto out;
+ }
+ /* Drop the TNC mutex prematurely and race with garbage collection */
zbr = znode->zbranch[n];
+ gc_seq1 = c->gc_seq;
mutex_unlock(&c->tnc_mutex);
- *lnum = zbr.lnum;
- *offs = zbr.offs;
+ if (ubifs_get_wbuf(c, zbr.lnum)) {
+ /* We do not GC journal heads */
+ err = ubifs_tnc_read_node(c, &zbr, node);
+ return err;
+ }
- err = ubifs_tnc_read_node(c, &zbr, node);
- return err;
+ err = fallible_read_node(c, key, &zbr, node);
+ if (maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
+ /*
+ * The node may have been GC'ed out from under us so try again
+ * while keeping the TNC mutex locked.
+ */
+ safely = 1;
+ goto again;
+ }
+ return 0;
out:
mutex_unlock(&c->tnc_mutex);
@@ -1498,7 +1509,6 @@
{
int found, n, err;
struct ubifs_znode *znode;
- struct ubifs_zbranch zbr;
dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1532,7 @@
goto out_unlock;
}
- zbr = znode->zbranch[n];
- mutex_unlock(&c->tnc_mutex);
-
- err = tnc_read_node_nm(c, &zbr, node);
- return err;
+ err = tnc_read_node_nm(c, &znode->zbranch[n], node);
out_unlock:
mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index bd2121f..a9ecbd9 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -87,7 +87,7 @@
#define UBIFS_SK_LEN 8
/* Minimum index tree fanout */
-#define UBIFS_MIN_FANOUT 2
+#define UBIFS_MIN_FANOUT 3
/* Maximum number of levels in UBIFS indexing B-tree */
#define UBIFS_MAX_LEVELS 512
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d7f706f..17c620b 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -995,6 +995,9 @@
* @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
* @max_inode_sz: maximum possible inode size in bytes
* @max_znode_sz: size of znode in bytes
+ *
+ * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
+ * data nodes of maximum size - used in free space reporting
* @dead_wm: LEB dead space watermark
* @dark_wm: LEB dark space watermark
* @block_cnt: count of 4KiB blocks on the FS
@@ -1028,6 +1031,8 @@
* @sbuf: a buffer of LEB size used by GC and replay for scanning
* @idx_gc: list of index LEBs that have been garbage collected
* @idx_gc_cnt: number of elements on the idx_gc list
+ * @gc_seq: incremented for every non-index LEB garbage collected
+ * @gced_lnum: last non-index LEB that was garbage collected
*
* @infos_list: links all 'ubifs_info' objects
* @umount_mutex: serializes shrinker and un-mount
@@ -1224,6 +1229,8 @@
int max_idx_node_sz;
long long max_inode_sz;
int max_znode_sz;
+
+ int leb_overhead;
int dead_wm;
int dark_wm;
int block_cnt;
@@ -1257,6 +1264,8 @@
void *sbuf;
struct list_head idx_gc;
int idx_gc_cnt;
+ volatile int gc_seq;
+ volatile int gced_lnum;
struct list_head infos_list;
struct mutex umount_mutex;
@@ -1434,9 +1443,10 @@
struct ubifs_budget_req *req);
void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
struct ubifs_budget_req *req);
-long long ubifs_budg_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space(struct ubifs_info *c);
int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
void ubifs_convert_page_budget(struct ubifs_info *c);
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
/* find.c */
@@ -1451,8 +1461,6 @@
/* tnc.c */
int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode **zn, int *n);
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
- void *node);
int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
void *node, const struct qstr *nm);
int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 8feeae1..79a7ff9 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -14,4 +14,10 @@
extern char __initdata_begin[], __initdata_end[];
extern char __start_rodata[], __end_rodata[];
+/* function descriptor handling (if any). Override
+ * in asm/sections.h */
+#ifndef dereference_function_descriptor
+#define dereference_function_descriptor(p) (p)
+#endif
+
#endif /* _ASM_GENERIC_SECTIONS_H_ */
diff --git a/include/asm-parisc/sections.h b/include/asm-parisc/sections.h
index fdd43ec..9d13c35 100644
--- a/include/asm-parisc/sections.h
+++ b/include/asm-parisc/sections.h
@@ -4,4 +4,9 @@
/* nothing to see, move along */
#include <asm-generic/sections.h>
+#ifdef CONFIG_64BIT
+#undef dereference_function_descriptor
+void *dereference_function_descriptor(void *);
+#endif
+
#endif
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index d8d1d11..c399bc1 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -27,6 +27,7 @@
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/div64.h>
+#include <asm/sections.h> /* for dereference_function_descriptor() */
/* Works only for digits and letters, but small and fast */
#define TOLOWER(x) ((x) | 0x20)
@@ -513,16 +514,6 @@
return buf;
}
-static inline void *dereference_function_descriptor(void *ptr)
-{
-#if defined(CONFIG_IA64) || defined(CONFIG_PPC64)
- void *p;
- if (!probe_kernel_address(ptr, p))
- ptr = p;
-#endif
- return ptr;
-}
-
static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags)
{
unsigned long value = (unsigned long) ptr;