rebase patches

and drop those applied in 4.14/4.15
This commit is contained in:
Fabian Grünbichler 2018-03-09 14:43:58 +01:00
parent 55f9bfa990
commit 15baf5b4c2
29 changed files with 20 additions and 1844 deletions

View file

@ -20,10 +20,10 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
index fd8fdb91581d..1e35ac9fc810 100755
index 87f1fc9801d7..4ef868f1f244 100755
--- a/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@ -37,10 +37,14 @@ else
@@ -33,10 +33,14 @@ else
VERSION=$KBUILD_BUILD_VERSION
fi

View file

@ -18,7 +18,7 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 89110319ef0f..5e73fff65f47 100644
index 808e2b914015..b0ad54384826 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -259,10 +259,7 @@ bool br_stp_recalculate_bridge_id(struct net_bridge *br)

View file

@ -74,7 +74,7 @@ index 27ca3fbc47aa..5e3caff3fb49 100644
Safety option to keep boot IRQs enabled. This
should never be necessary.
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index db82bef43b99..d338fdb7c402 100644
index db82bef43b99..ed94ba0d0922 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3695,6 +3695,106 @@ static int __init pci_apply_final_quirks(void)

View file

@ -12,7 +12,7 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d81f6ded88e..bfa9c4d34102 100644
index 210bf820385a..5b7e582f3742 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -77,7 +77,7 @@ module_param(halt_poll_ns, uint, 0644);

View file

@ -1,66 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 17 Aug 2017 15:33:09 -0400
Subject: [PATCH] cgroup: Add mount flag to enable cpuset to use v2 behavior in
v1 cgroup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A new mount option "cpuset_v2_mode" is added to the v1 cgroupfs
filesystem to enable cpuset controller to use v2 behavior in a v1
cgroup. This mount option applies only to cpuset controller and have
no effect on other controllers.
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
(cherry-picked from e1cba4b85daa71b710384d451ff6238d5e4d1ff6)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
include/linux/cgroup-defs.h | 5 +++++
kernel/cgroup/cgroup-v1.c | 6 ++++++
2 files changed, 11 insertions(+)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 09f4c7df1478..c344e77707a5 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -74,6 +74,11 @@ enum {
* aren't writeable from inside the namespace.
*/
CGRP_ROOT_NS_DELEGATE = (1 << 3),
+
+ /*
+ * Enable cpuset controller in v1 cgroup to use v2 behavior.
+ */
+ CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
};
/* cftype->flags */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 7bf4b1533f34..ce7426b875f5 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -846,6 +846,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+ if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
+ seq_puts(seq, ",cpuset_v2_mode");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -900,6 +902,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->cpuset_clone_children = true;
continue;
}
+ if (!strcmp(token, "cpuset_v2_mode")) {
+ opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ continue;
+ }
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
--
2.14.2

View file

@ -54,7 +54,7 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 989a900383b5..e1a3ae4f3cab 100644
index 6a38c2503649..91813e686c67 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,

View file

@ -1,141 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 17 Aug 2017 15:33:10 -0400
Subject: [PATCH] cpuset: Allow v2 behavior in v1 cgroup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Cpuset v2 has some useful behaviors that are not present in v1 because
of backward compatibility concern. One of that is the restoration of
the original cpu and memory node mask after a hot removal and addition
event sequence.
This patch makes the cpuset controller to check the
CGRP_ROOT_CPUSET_V2_MODE flag and use the v2 behavior if it is set.
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
(cherry-picked from b8d1b8ee93df8ffbabbeadd65d39853cfad6d698)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
kernel/cgroup/cpuset.c | 33 ++++++++++++++++++++-------------
1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e8cb34193433..f76c4bf3d46a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -299,6 +299,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+/*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
@@ -489,8 +499,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -896,8 +905,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- cpumask_empty(new_cpus))
+ if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -914,7 +922,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1150,8 +1158,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- nodes_empty(*new_mems))
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1168,7 +1175,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1460,7 +1467,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1979,7 +1986,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2056,7 +2063,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2250,7 +2257,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2288,7 +2295,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool on_dfl = is_in_v2_mode();
mutex_lock(&cpuset_mutex);
--
2.14.2

View file

@ -27,7 +27,7 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index a4b6ffb61495..c5a5ad8ac00f 100644
index 5d4c15bf66d2..e35d4f73d2df 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,

View file

@ -1,90 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Thu, 30 Nov 2017 19:05:45 +0100
Subject: [PATCH] KVM: x86: fix APIC page invalidation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Implementation of the unpinned APIC page didn't update the VMCS address
cache when invalidation was done through range mmu notifiers.
This became a problem when the page notifier was removed.
Re-introduce the arch-specific helper and call it from ...range_start.
Fixes: 38b9917350cb ("kvm: vmx: Implement set_apic_access_page_addr")
Fixes: 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2")
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/kvm_host.h | 3 +++
arch/x86/kvm/x86.c | 14 ++++++++++++++
virt/kvm/kvm_main.c | 8 ++++++++
3 files changed, 25 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 78ec3cda9429..1953c0a5b972 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1439,4 +1439,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#endif
}
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f896c441fc2c..eae4aecf3cfe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6711,6 +6711,20 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
kvm_x86_ops->tlb_flush(vcpu);
}
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long apic_address;
+
+ /*
+ * The physical address of apic access page is stored in the VMCS.
+ * Update it when it becomes invalid.
+ */
+ apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (start <= apic_address && apic_address < end)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+}
+
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
struct page *page = NULL;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bfa9c4d34102..d0085c9d6297 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -136,6 +136,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;
+__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
+{
+}
+
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
@@ -361,6 +366,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+
+ kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+
srcu_read_unlock(&kvm->srcu, idx);
}
--
2.14.2

View file

@ -28,10 +28,10 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 88a31e9340a0..77ec9b495027 100644
index d1516327b787..256986aca8df 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -784,6 +784,7 @@ struct ocfs2_write_ctxt {
@@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
struct ocfs2_cached_dealloc_ctxt w_dealloc;
struct list_head w_unwritten_list;
@ -39,7 +39,7 @@ index 88a31e9340a0..77ec9b495027 100644
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1373,6 +1374,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
@@ -1386,6 +1387,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
desc->c_clear_unwritten = 0;
list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
list_add_tail(&new->ue_node, &wc->w_unwritten_list);
@ -47,7 +47,7 @@ index 88a31e9340a0..77ec9b495027 100644
new = NULL;
unlock:
spin_unlock(&oi->ip_lock);
@@ -2246,7 +2248,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
ue->ue_phys = desc->c_phys;
list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);

View file

@ -72,7 +72,7 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
3 files changed, 203 insertions(+), 10 deletions(-)
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 4a5152ec88a3..571692171dd1 100644
index 27b75cf32cfa..250bcacdf9e9 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
@ -84,7 +84,7 @@ index 4a5152ec88a3..571692171dd1 100644
/*
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 386aecce881d..9b5e7d8ba710 100644
index ab5105f9767e..2f2c76193f54 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
@ -109,7 +109,7 @@ index 386aecce881d..9b5e7d8ba710 100644
et->et_ops->eo_fill_root_el(et);
if (!et->et_ops->eo_fill_max_leaf_clusters)
@@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle,
@@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
@ -118,7 +118,7 @@ index 386aecce881d..9b5e7d8ba710 100644
u64 next_blkno, new_last_eb_blk;
struct buffer_head *bh;
struct buffer_head **new_eb_bhs = NULL;
@@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle,
@@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle,
goto bail;
}
@ -155,7 +155,7 @@ index 386aecce881d..9b5e7d8ba710 100644
}
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
@@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
{
@ -184,7 +184,7 @@ index 386aecce881d..9b5e7d8ba710 100644
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
@@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
int depth = le16_to_cpu(el->l_tree_depth);
struct buffer_head *bh = NULL;
@ -193,7 +193,7 @@ index 386aecce881d..9b5e7d8ba710 100644
shift = ocfs2_find_branch_target(et, &bh);
if (shift < 0) {
@@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type,
@@ -6585,6 +6623,154 @@ ocfs2_find_per_slot_free_list(int type,
return fl;
}
@ -349,10 +349,10 @@ index 386aecce881d..9b5e7d8ba710 100644
int type, int slot, u64 suballoc,
u64 blkno, unsigned int bit)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 77ec9b495027..2ff02dda97d8 100644
index 256986aca8df..e8e205bf2e41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
@@ -2332,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);

View file

@ -1,72 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Wei Xu <wexu@redhat.com>
Date: Fri, 1 Dec 2017 05:10:36 -0500
Subject: [PATCH] vhost: fix skb leak in handle_rx()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Matthew found a roughly 40% tcp throughput regression with commit
c67df11f(vhost_net: try batch dequing from skb array) as discussed
in the following thread:
https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html
Eventually we figured out that it was a skb leak in handle_rx()
when sending packets to the VM. This usually happens when a guest
can not drain out vq as fast as vhost fills in, afterwards it sets
off the traffic jam and leaks skb(s) which occurs as no headcount
to send on the vq from vhost side.
This can be avoided by making sure we have got enough headcount
before actually consuming a skb from the batched rx array while
transmitting, which is simply done by moving checking the zero
headcount a bit ahead.
Signed-off-by: Wei Xu <wexu@redhat.com>
Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/vhost/net.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1c75572f5a3f..010253847022 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -781,16 +781,6 @@ static void handle_rx(struct vhost_net *net)
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
goto out;
- if (nvq->rx_array)
- msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
- /* On overrun, truncate and discard */
- if (unlikely(headcount > UIO_MAXIOV)) {
- iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
- err = sock->ops->recvmsg(sock, &msg,
- 1, MSG_DONTWAIT | MSG_TRUNC);
- pr_debug("Discarded rx packet: len %zd\n", sock_len);
- continue;
- }
/* OK, now we need to know about added descriptors. */
if (!headcount) {
if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -803,6 +793,16 @@ static void handle_rx(struct vhost_net *net)
* they refilled. */
goto out;
}
+ if (nvq->rx_array)
+ msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
+ /* On overrun, truncate and discard */
+ if (unlikely(headcount > UIO_MAXIOV)) {
+ iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
+ err = sock->ops->recvmsg(sock, &msg,
+ 1, MSG_DONTWAIT | MSG_TRUNC);
+ pr_debug("Discarded rx packet: len %zd\n", sock_len);
+ continue;
+ }
/* We don't need to be notified again. */
iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
fixup = msg.msg_iter;
--
2.14.2

View file

@ -1,86 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Wei Xu <wexu@redhat.com>
Date: Fri, 1 Dec 2017 05:10:37 -0500
Subject: [PATCH] tun: free skb in early errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
tun_recvmsg() supports accepting skb by msg_control after
commit ac77cfd4258f ("tun: support receiving skb through msg_control"),
the skb if presented should be freed no matter how far it can go
along, otherwise it would be leaked.
This patch fixes several missed cases.
Signed-off-by: Wei Xu <wexu@redhat.com>
Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/net/tun.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d1cb1ff83251..d58ae8ad0a4e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1519,8 +1519,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
tun_debug(KERN_INFO, tun, "tun_do_read\n");
- if (!iov_iter_count(to))
+ if (!iov_iter_count(to)) {
+ if (skb)
+ kfree_skb(skb);
return 0;
+ }
if (!skb) {
/* Read frames from ring */
@@ -1636,22 +1639,24 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
{
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = __tun_get(tfile);
+ struct sk_buff *skb = m->msg_control;
int ret;
- if (!tun)
- return -EBADFD;
+ if (!tun) {
+ ret = -EBADFD;
+ goto out_free_skb;
+ }
if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
ret = -EINVAL;
- goto out;
+ goto out_put_tun;
}
if (flags & MSG_ERRQUEUE) {
ret = sock_recv_errqueue(sock->sk, m, total_len,
SOL_PACKET, TUN_TX_TIMESTAMP);
goto out;
}
- ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
- m->msg_control);
+ ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb);
if (ret > (ssize_t)total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
@@ -1659,6 +1664,13 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
out:
tun_put(tun);
return ret;
+
+out_put_tun:
+ tun_put(tun);
+out_free_skb:
+ if (skb)
+ kfree_skb(skb);
+ return ret;
}
static int tun_peek_len(struct socket *sock)
--
2.14.2

View file

@ -1,58 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Wei Xu <wexu@redhat.com>
Date: Fri, 1 Dec 2017 05:10:38 -0500
Subject: [PATCH] tap: free skb if flags error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
tap_recvmsg() supports accepting skb by msg_control after
commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"),
the skb if presented should be freed within the function, otherwise
it would be leaked.
Signed-off-by: Wei Xu <wexu@redhat.com>
Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/net/tap.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 7a2f6bebfd15..96e5e5b2ae39 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -829,8 +829,11 @@ static ssize_t tap_do_read(struct tap_queue *q,
DEFINE_WAIT(wait);
ssize_t ret = 0;
- if (!iov_iter_count(to))
+ if (!iov_iter_count(to)) {
+ if (skb)
+ kfree_skb(skb);
return 0;
+ }
if (skb)
goto put;
@@ -1157,11 +1160,14 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m,
size_t total_len, int flags)
{
struct tap_queue *q = container_of(sock, struct tap_queue, sock);
+ struct sk_buff *skb = m->msg_control;
int ret;
- if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
+ if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
+ if (skb)
+ kfree_skb(skb);
return -EINVAL;
- ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT,
- m->msg_control);
+ }
+ ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);
if (ret > total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
--
2.14.2

View file

@ -1,93 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 5 Jan 2018 23:51:12 +0100
Subject: [PATCH] IB/core: Avoid crash on pkey enforcement failed in received
MADs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 89548bcafec7ecfeea58c553f0834b5d575a66eb upstream.
Below kernel crash is observed when Pkey security enforcement fails on
received MADs. This issue is reported in [1].
ib_free_recv_mad() accesses the rmpp_list, whose initialization is
needed before accessing it.
When security enformcent fails on received MADs, MAD processing avoided
due to security checks failed.
OpenSM[3770]: SM port is down
kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
kernel: IP: ib_free_recv_mad+0x44/0xa0 [ib_core]
kernel: PGD 0
kernel: P4D 0
kernel:
kernel: Oops: 0002 [#1] SMP
kernel: CPU: 0 PID: 2833 Comm: kworker/0:1H Tainted: P IO 4.13.4-1-pve #1
kernel: Hardware name: Dell XS23-TY3 /9CMP63, BIOS 1.71 09/17/2013
kernel: Workqueue: ib-comp-wq ib_cq_poll_work [ib_core]
kernel: task: ffffa069c6541600 task.stack: ffffb9a729054000
kernel: RIP: 0010:ib_free_recv_mad+0x44/0xa0 [ib_core]
kernel: RSP: 0018:ffffb9a729057d38 EFLAGS: 00010286
kernel: RAX: ffffa069cb138a48 RBX: ffffa069cb138a10 RCX: 0000000000000000
kernel: RDX: ffffb9a729057d38 RSI: 0000000000000000 RDI: ffffa069cb138a20
kernel: RBP: ffffb9a729057d60 R08: ffffa072d2d49800 R09: ffffa069cb138ae0
kernel: R10: ffffa069cb138ae0 R11: ffffa072b3994e00 R12: ffffb9a729057d38
kernel: R13: ffffa069d1c90000 R14: 0000000000000000 R15: ffffa069d1c90880
kernel: FS: 0000000000000000(0000) GS:ffffa069dba00000(0000) knlGS:0000000000000000
kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
kernel: CR2: 0000000000000008 CR3: 00000011f51f2000 CR4: 00000000000006f0
kernel: Call Trace:
kernel: ib_mad_recv_done+0x5cc/0xb50 [ib_core]
kernel: __ib_process_cq+0x5c/0xb0 [ib_core]
kernel: ib_cq_poll_work+0x20/0x60 [ib_core]
kernel: process_one_work+0x1e9/0x410
kernel: worker_thread+0x4b/0x410
kernel: kthread+0x109/0x140
kernel: ? process_one_work+0x410/0x410
kernel: ? kthread_create_on_node+0x70/0x70
kernel: ? SyS_exit_group+0x14/0x20
kernel: ret_from_fork+0x25/0x30
kernel: RIP: ib_free_recv_mad+0x44/0xa0 [ib_core] RSP: ffffb9a729057d38
kernel: CR2: 0000000000000008
[1] : https://www.spinics.net/lists/linux-rdma/msg56190.html
Fixes: 47a2b338fe63 ("IB/core: Enforce security on management datagrams")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reported-by: Chris Blake <chrisrblake93@gmail.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Hal Rosenstock <hal@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/infiniband/core/mad.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index f8f53bb90837..cb91245e9163 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -1974,14 +1974,15 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
unsigned long flags;
int ret;
+ INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
ret = ib_mad_enforce_security(mad_agent_priv,
mad_recv_wc->wc->pkey_index);
if (ret) {
ib_free_recv_mad(mad_recv_wc);
deref_mad_agent(mad_agent_priv);
+ return;
}
- INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
--
2.14.2

View file

@ -1,47 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@mellanox.com>
Date: Mon, 20 Nov 2017 16:47:45 -0600
Subject: [PATCH] IB/core: Don't enforce PKey security on SMI MADs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Per the infiniband spec an SMI MAD can have any PKey. Checking the pkey
on SMI MADs is not necessary, and it seems that some older adapters
using the mthca driver don't follow the convention of using the default
PKey, resulting in false denials, or errors querying the PKey cache.
SMI MAD security is still enforced, only agents allowed to manage the
subnet are able to receive or send SMI MADs.
Reported-by: Chris Blake <chrisrblake93@gmail.com>
Fixes: 47a2b338fe63("IB/core: Enforce security on management datagrams")
Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/infiniband/core/security.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 70ad19c4c73e..8f9fd3b757db 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -692,8 +692,11 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
{
int ret;
- if (map->agent.qp->qp_type == IB_QPT_SMI && !map->agent.smp_allowed)
- return -EACCES;
+ if (map->agent.qp->qp_type == IB_QPT_SMI) {
+ if (!map->agent.smp_allowed)
+ return -EACCES;
+ return 0;
+ }
ret = ib_security_pkey_access(map->agent.device,
map->agent.port_num,
--
2.14.2

View file

@ -1,299 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 6 Nov 2017 13:31:12 +0100
Subject: [PATCH] kvm: vmx: Reinstate support for CPUs without virtual NMI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 8a1b43922d0d1279e7936ba85c4c2a870403c95f upstream.
This is more or less a revert of commit 2c82878b0cb3 ("KVM: VMX: require
virtual NMI support", 2017-03-27); it turns out that Core 2 Duo machines
only had virtual NMIs in some SKUs.
The revert is not trivial because in the meanwhile there have been several
fixes to nested NMI injection. Therefore, the entire vNMI state is moved
to struct loaded_vmcs.
Another change compared to before the patch is a simplification here:
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
get_vmcs12(vcpu))))) {
The final condition here is always true (because nested_cpu_has_virtual_nmis
is always false) and is removed.
Fixes: 2c82878b0cb38fd516fd612c67852a6bbf282003
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1490803
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kvm/vmx.c | 150 +++++++++++++++++++++++++++++++++++++----------------
1 file changed, 106 insertions(+), 44 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5edf05ce45de..146caacd8fdd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -204,6 +204,10 @@ struct loaded_vmcs {
bool nmi_known_unmasked;
unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
struct list_head loaded_vmcss_on_cpu_link;
};
@@ -1290,6 +1294,11 @@ static inline bool cpu_has_vmx_invpcid(void)
SECONDARY_EXEC_ENABLE_INVPCID;
}
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
static inline bool cpu_has_vmx_wbinvd_exit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1341,11 +1350,6 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
}
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
-{
- return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
-}
-
static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
{
return vmcs12->pin_based_vm_exec_control &
@@ -3687,9 +3691,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_vmexit_control) < 0)
return -EIO;
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
- PIN_BASED_VIRTUAL_NMIS;
- opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
+ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
@@ -5549,7 +5553,8 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
- if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ if (!cpu_has_virtual_nmis() ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
enable_irq_window(vcpu);
return;
}
@@ -5589,6 +5594,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+
++vcpu->stat.nmi_injections;
vmx->loaded_vmcs->nmi_known_unmasked = false;
@@ -5607,6 +5625,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool masked;
+ if (!cpu_has_virtual_nmis())
+ return vmx->loaded_vmcs->soft_vnmi_blocked;
if (vmx->loaded_vmcs->nmi_known_unmasked)
return false;
masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
@@ -5618,13 +5638,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->loaded_vmcs->nmi_known_unmasked = !masked;
- if (masked)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ if (!cpu_has_virtual_nmis()) {
+ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+ } else {
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -5632,6 +5659,10 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
if (to_vmx(vcpu)->nested.nested_run_pending)
return 0;
+ if (!cpu_has_virtual_nmis() &&
+ to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return 0;
+
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
| GUEST_INTR_STATE_NMI));
@@ -6360,6 +6391,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* AAK134, BY25.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ cpu_has_virtual_nmis() &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
@@ -6834,7 +6866,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
}
/* Create a new VMCS */
- item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+ item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
if (!item)
return NULL;
item->vmcs02.vmcs = alloc_vmcs();
@@ -7851,6 +7883,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
* "blocked by NMI" bit has to be set before next VM entry.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ cpu_has_virtual_nmis() &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -8568,6 +8601,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
return 0;
}
+ if (unlikely(!cpu_has_virtual_nmis() &&
+ vmx->loaded_vmcs->soft_vnmi_blocked)) {
+ if (vmx_interrupt_allowed(vcpu)) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ }
+ }
+
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
@@ -8850,33 +8902,38 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
- if (vmx->loaded_vmcs->nmi_known_unmasked)
- return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- * SDM 3: 23.2.2 (September 2008)
- * Bit 12 is undefined in any of the following cases:
- * If the VM exit sets the valid bit in the IDT-vectoring
- * information field.
- * If the VM exit is due to a double fault.
- */
- if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
- vector != DF_VECTOR && !idtv_info_valid)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmx->loaded_vmcs->nmi_known_unmasked =
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
- & GUEST_INTR_STATE_NMI);
+ if (cpu_has_virtual_nmis()) {
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(),
+ vmx->loaded_vmcs->entry_time));
}
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@ -8993,6 +9050,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long debugctlmsr, cr3, cr4;
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!cpu_has_virtual_nmis() &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
if (vmx->emulation_required)
--
2.14.2

View file

@ -1,56 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 26 Oct 2017 09:13:27 +0200
Subject: [PATCH] KVM: SVM: obey guest PAT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
For many years some users of assigned devices have reported worse
performance on AMD processors with NPT than on AMD without NPT,
Intel or bare metal.
The reason turned out to be that SVM is discarding the guest PAT
setting and uses the default (PA0=PA4=WB, PA1=PA5=WT, PA2=PA6=UC-,
PA3=UC). The guest might be using a different setting, and
especially might want write combining but isn't getting it
(instead getting slow UC or UC- accesses).
Thanks a lot to geoff@hostfission.com for noticing the relation
to the g_pat setting. The patch has been tested also by a bunch
of people on VFIO users forums.
Fixes: 709ddebf81cb40e3c36c6109a7892e8b93a09464
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196409
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Tested-by: Nick Sarnie <commendsarnex@gmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
(cherry picked from commit 15038e14724799b8c205beb5f20f9e54896013c3)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kvm/svm.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a8c911fcd73f..e9d0f80fd83a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3650,6 +3650,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+ case MSR_IA32_CR_PAT:
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm->vmcb->save.g_pat = data;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
--
2.14.2

View file

@ -1,33 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
Date: Fri, 19 Jan 2018 11:12:37 +0100
Subject: [PATCH] net: sched: em_nbyte: don't add the data offset twice
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
'ptr' is shifted by the offset and then validated,
the memcmp should not add it a second time.
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
net/sched/em_nbyte.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index df3110d69585..07c10bac06a0 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
return 0;
- return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
+ return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
}
static struct tcf_ematch_ops em_nbyte_ops = {
--
2.14.2

View file

@ -1,34 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
Date: Fri, 19 Jan 2018 11:12:38 +0100
Subject: [PATCH] net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as
skb->data points to the network header.
Use skb_mac_header instead.
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
include/net/pkt_cls.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 537d0a0ad4c4..4450961b1554 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -395,7 +395,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
{
switch (layer) {
case TCF_LAYER_LINK:
- return skb->data;
+ return skb_mac_header(skb);
case TCF_LAYER_NETWORK:
return skb_network_header(skb);
case TCF_LAYER_TRANSPORT:
--
2.14.2

View file

@ -1,127 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 4 Oct 2017 08:44:43 -0700
Subject: [PATCH] i40e: Fix memory leak related filter programming status
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
It looks like we weren't correctly placing the pages from buffers that had
been used to return a filter programming status back on the ring. As a
result they were being overwritten and tracking of the pages was lost.
This change works to correct that by incorporating part of
i40e_put_rx_buffer into the programming status handler code. As a result we
should now be correctly placing the pages for those buffers on the
re-allocation list instead of letting them stay in place.
Fixes: 0e626ff7ccbf ("i40e: Fix support for flow director programming status")
Reported-by: Anders K. Pedersen <akp@cohaesio.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Anders K Pedersen <akp@cohaesio.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
(cherry picked from commit 2b9478ffc550f17c6cd8c69057234e91150f5972)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 63 ++++++++++++++++-------------
1 file changed, 36 insertions(+), 27 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 2194960d5855..391b1878c24b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1042,6 +1042,32 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
return false;
}
+/**
+ * i40e_reuse_rx_page - page flip buffer and store it back on the ring
+ * @rx_ring: rx descriptor ring to store buffers on
+ * @old_buff: donor buffer to have page reused
+ *
+ * Synchronizes page for reuse by the adapter
+ **/
+static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
+ struct i40e_rx_buffer *old_buff)
+{
+ struct i40e_rx_buffer *new_buff;
+ u16 nta = rx_ring->next_to_alloc;
+
+ new_buff = &rx_ring->rx_bi[nta];
+
+ /* update, and store next to alloc */
+ nta++;
+ rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+ /* transfer page from old buffer to new buffer */
+ new_buff->dma = old_buff->dma;
+ new_buff->page = old_buff->page;
+ new_buff->page_offset = old_buff->page_offset;
+ new_buff->pagecnt_bias = old_buff->pagecnt_bias;
+}
+
/**
* i40e_rx_is_programming_status - check for programming status descriptor
* @qw: qword representing status_error_len in CPU ordering
@@ -1076,15 +1102,24 @@ static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc,
u64 qw)
{
- u32 ntc = rx_ring->next_to_clean + 1;
+ struct i40e_rx_buffer *rx_buffer;
+ u32 ntc = rx_ring->next_to_clean;
u8 id;
/* fetch, update, and store next to clean */
+ rx_buffer = &rx_ring->rx_bi[ntc++];
ntc = (ntc < rx_ring->count) ? ntc : 0;
rx_ring->next_to_clean = ntc;
prefetch(I40E_RX_DESC(rx_ring, ntc));
+ /* place unused page back on the ring */
+ i40e_reuse_rx_page(rx_ring, rx_buffer);
+ rx_ring->rx_stats.page_reuse_count++;
+
+ /* clear contents of buffer_info */
+ rx_buffer->page = NULL;
+
id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
@@ -1643,32 +1678,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
return false;
}
-/**
- * i40e_reuse_rx_page - page flip buffer and store it back on the ring
- * @rx_ring: rx descriptor ring to store buffers on
- * @old_buff: donor buffer to have page reused
- *
- * Synchronizes page for reuse by the adapter
- **/
-static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *old_buff)
-{
- struct i40e_rx_buffer *new_buff;
- u16 nta = rx_ring->next_to_alloc;
-
- new_buff = &rx_ring->rx_bi[nta];
-
- /* update, and store next to alloc */
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- /* transfer page from old buffer to new buffer */
- new_buff->dma = old_buff->dma;
- new_buff->page = old_buff->page;
- new_buff->page_offset = old_buff->page_offset;
- new_buff->pagecnt_bias = old_buff->pagecnt_bias;
-}
-
/**
* i40e_page_is_reusable - check if any reuse is possible
* @page: page struct to check
--
2.14.2

View file

@ -1,49 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Andrew Honig <ahonig@google.com>
Date: Wed, 10 Jan 2018 10:12:03 -0800
Subject: [PATCH] KVM: x86: Add memory barrier on vmcs field lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream.
This adds a memory barrier when performing a lookup into
the vmcs_field_to_offset_table. This is related to
CVE-2017-5753.
Signed-off-by: Andrew Honig <ahonig@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kvm/vmx.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 146caacd8fdd..80732f87cac0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -883,8 +883,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
{
BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
- vmcs_field_to_offset_table[field] == 0)
+ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+ return -ENOENT;
+
+ /*
+ * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
+ * generic mechanism.
+ */
+ asm("lfence");
+
+ if (vmcs_field_to_offset_table[field] == 0)
return -ENOENT;
return vmcs_field_to_offset_table[field];
--
2.14.2

View file

@ -1,102 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Wed, 13 Sep 2017 18:42:14 +0800
Subject: [PATCH] EDAC, sb_edac: Don't create a second memory controller if HA1
is not present
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Yi Zhang reported the following failure on a 2-socket Haswell (E5-2603v3)
server (DELL PowerEdge 730xd):
EDAC sbridge: Some needed devices are missing
EDAC MC: Removed device 0 for sb_edac.c Haswell SrcID#0_Ha#0: DEV 0000:7f:12.0
EDAC MC: Removed device 1 for sb_edac.c Haswell SrcID#1_Ha#0: DEV 0000:ff:12.0
EDAC sbridge: Couldn't find mci handler
EDAC sbridge: Couldn't find mci handler
EDAC sbridge: Failed to register device with error -19.
The refactored sb_edac driver creates the IMC1 (the 2nd memory
controller) if any IMC1 device is present. In this case only
HA1_TA of IMC1 was present, but the driver expected to find
HA1/HA1_TM/HA1_TAD[0-3] devices too, leading to the above failure.
The document [1] says the 'E5-2603 v3' CPU has 4 memory channels max. Yi
Zhang inserted one DIMM per channel for each CPU, and did random error
address injection test with this patch:
4024 addresses fell in TOLM hole area
12715 addresses fell in CPU_SrcID#0_Ha#0_Chan#0_DIMM#0
12774 addresses fell in CPU_SrcID#0_Ha#0_Chan#1_DIMM#0
12798 addresses fell in CPU_SrcID#0_Ha#0_Chan#2_DIMM#0
12913 addresses fell in CPU_SrcID#0_Ha#0_Chan#3_DIMM#0
12674 addresses fell in CPU_SrcID#1_Ha#0_Chan#0_DIMM#0
12686 addresses fell in CPU_SrcID#1_Ha#0_Chan#1_DIMM#0
12882 addresses fell in CPU_SrcID#1_Ha#0_Chan#2_DIMM#0
12934 addresses fell in CPU_SrcID#1_Ha#0_Chan#3_DIMM#0
106400 addresses were injected totally.
The test result shows that all the 4 channels belong to IMC0 per CPU, so
the server really only has one IMC per CPU.
In the 1st page of chapter 2 in datasheet [2], it also says 'E5-2600 v3'
implements either one or two IMCs. For CPUs with one IMC, IMC1 is not
used and should be ignored.
Thus, do not create a second memory controller if the key HA1 is absent.
[1] http://ark.intel.com/products/83349/Intel-Xeon-Processor-E5-2603-v3-15M-Cache-1_60-GHz
[2] https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf
Reported-and-tested-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Fixes: e2f747b1f42a ("EDAC, sb_edac: Assign EDAC memory controller per h/w controller")
Link: http://lkml.kernel.org/r/20170913104214.7325-1-qiuxu.zhuo@intel.com
[ Massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
(cherry picked from commit 15cc3ae001873845b5d842e212478a6570c7d938)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/edac/sb_edac.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 80d860cb0746..7a3b201d51df 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -455,6 +455,7 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = {
static const struct pci_id_descr pci_dev_descr_ibridge[] = {
/* Processor Home Agent */
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) },
/* Memory controller */
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA, 0, IMC0) },
@@ -465,7 +466,6 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = {
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3, 0, IMC0) },
/* Optional, mode 2HA */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA, 1, IMC1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS, 1, IMC1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1, IMC1) },
@@ -2260,6 +2260,13 @@ static int sbridge_get_onedevice(struct pci_dev **prev,
next_imc:
sbridge_dev = get_sbridge_dev(bus, dev_descr->dom, multi_bus, sbridge_dev);
if (!sbridge_dev) {
+ /* If the HA1 wasn't found, don't create EDAC second memory controller */
+ if (dev_descr->dom == IMC1 && devno != 1) {
+ edac_dbg(0, "Skip IMC1: %04x:%04x (since HA1 was absent)\n",
+ PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
+ pci_dev_put(pdev);
+ return 0;
+ }
if (dev_descr->dom == SOCK)
goto out_imc;
--
2.14.2

View file

@ -1,37 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Mon, 16 Oct 2017 12:40:29 -0500
Subject: [PATCH] EDAC, sb_edac: Fix missing break in switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add missing break statement in order to prevent the code from falling
through.
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20171016174029.GA19757@embeddedor.com
Signed-off-by: Borislav Petkov <bp@suse.de>
(cherry picked from commit a8e9b186f153a44690ad0363a56716e7077ad28c)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/edac/sb_edac.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 7a3b201d51df..fb0264ef83a3 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -2467,6 +2467,7 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA:
pvt->pci_ta = pdev;
+ break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS:
pvt->pci_ras = pdev;
--
2.14.2

View file

@ -1,45 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Dick Kennedy <dick.kennedy@broadcom.com>
Date: Wed, 23 Aug 2017 16:55:31 -0700
Subject: [PATCH] scsi: lpfc: Fix loop mode target discovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The driver does not discover targets when in loop mode.
The NLP type is correctly getting set when a fabric connection is
detected but, not for loop. The unknown NLP type means that the driver
does not issue a PRLI when in loop topology. Thus target discovery
fails.
Fix by checking the topology during discovery. If it is loop, set the
NLP FC4 type to FCP.
Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
(cherry picked from commit 2877cbffb79ed121a6bcc5edbe629d3aba36cd29)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/scsi/lpfc/lpfc_nportdisc.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c
index f74cb0142fd4..95b2b43ac37d 100644
--- a/drivers/scsi/lpfc/lpfc_nportdisc.c
+++ b/drivers/scsi/lpfc/lpfc_nportdisc.c
@@ -1724,6 +1724,9 @@ lpfc_cmpl_reglogin_reglogin_issue(struct lpfc_vport *vport,
lpfc_nvme_update_localport(vport);
}
+ } else if (phba->fc_topology == LPFC_TOPOLOGY_LOOP) {
+ ndlp->nlp_fc4_type |= NLP_FC4_FCP;
+
} else if (ndlp->nlp_fc4_type == 0) {
rc = lpfc_ns_cmd(vport, SLI_CTNS_GFT_ID,
0, ndlp->nlp_DID);
--
2.14.2

View file

@ -1,52 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 5 Dec 2017 23:15:31 -0800
Subject: [PATCH] sched/wait: Fix add_wait_queue() behavioral change
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The following cleanup commit:
50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries")
... unintentionally changed the behavior of add_wait_queue() from
inserting the wait entry at the head of the wait queue to the tail
of the wait queue.
Beyond a negative performance impact this change in behavior
theoretically also breaks wait queues which mix exclusive and
non-exclusive waiters, as non-exclusive waiters will not be
woken up if they are queued behind enough exclusive waiters.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Fixes: ("sched/wait: Standardize internal naming of wait-queue entries")
Link: http://lkml.kernel.org/r/a16c8ccffd39bd08fdaa45a5192294c784b803a7.1512544324.git.osandov@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit c6b9d9a33029014446bd9ed84c1688f6d3d4eab9)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
kernel/sched/wait.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index d6afed6d0752..c09ebe92a40a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- __add_wait_queue_entry_tail(wq_head, wq_entry);
+ __add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);
--
2.14.2

View file

@ -1,164 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 25 Jan 2018 15:50:28 -0800
Subject: [PATCH] module/retpoline: Warn about missing retpoline in module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There's a risk that a kernel which has full retpoline mitigations becomes
vulnerable when a module gets loaded that hasn't been compiled with the
right compiler or the right option.
To enable detection of that mismatch at module load time, add a module info
string "retpoline" at build time when the module was compiled with
retpoline support. This only covers compiled C source, but assembler source
or prebuilt object files are not checked.
If a retpoline enabled kernel detects a non retpoline protected module at
load time, print a warning and report it in the sysfs vulnerability file.
[ tglx: Massaged changelog ]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: gregkh@linuxfoundation.org
Cc: torvalds@linux-foundation.org
Cc: jeyu@kernel.org
Cc: arjan@linux.intel.com
Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org
(backported from commit caf7501a1b4ec964190f31f9c3f163de252273b8)
Conflicts:
arch/x86/kernel/cpu/bugs.c
context changes
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
include/linux/module.h | 9 +++++++++
arch/x86/kernel/cpu/bugs.c | 19 +++++++++++++++++--
kernel/module.c | 11 +++++++++++
scripts/mod/modpost.c | 9 +++++++++
4 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/include/linux/module.h b/include/linux/module.h
index e7bdd549e527..c4fdf7661f82 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
static inline void module_bug_cleanup(struct module *mod) {}
#endif /* CONFIG_GENERIC_BUG */
+#ifdef RETPOLINE
+extern bool retpoline_module_ok(bool has_retpoline);
+#else
+static inline bool retpoline_module_ok(bool has_retpoline)
+{
+ return true;
+}
+#endif
+
#ifdef CONFIG_MODULE_SIG
static inline bool module_sig_ok(struct module *module)
{
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d5bafcdb4891..e623bd731a74 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -11,6 +11,7 @@
#include <linux/utsname.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/module.h>
#include <asm/nospec-branch.h>
#include <asm/cmdline.h>
@@ -93,6 +94,19 @@ static const char *spectre_v2_strings[] = {
#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt
static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+static bool spectre_v2_bad_module;
+
+#ifdef RETPOLINE
+bool retpoline_module_ok(bool has_retpoline)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
+ return true;
+
+ pr_err("System may be vunerable to spectre v2\n");
+ spectre_v2_bad_module = true;
+ return false;
+}
+#endif
static void __init spec2_print_if_insecure(const char *reason)
{
@@ -299,7 +313,8 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
return sprintf(buf, "Not affected\n");
- return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- ibpb_inuse ? ", IBPB (Intel v4)" : "");
+ return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+ ibpb_inuse ? ", IBPB (Intel v4)" : "",
+ spectre_v2_bad_module ? " - vulnerable module loaded" : "");
}
#endif
diff --git a/kernel/module.c b/kernel/module.c
index e5b878b26906..de7db074f793 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
}
#endif /* CONFIG_LIVEPATCH */
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+ return;
+
+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ mod->name);
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
}
+ check_modinfo_retpoline(mod, info);
+
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
pr_warn("%s: module is from the staging directory, the quality "
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 48397feb08fb..cc91f81ac33e 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -2147,6 +2147,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
}
+/* Cannot check for assembler */
+static void add_retpoline(struct buffer *b)
+{
+ buf_printf(b, "\n#ifdef RETPOLINE\n");
+ buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
+ buf_printf(b, "#endif\n");
+}
+
static void add_staging_flag(struct buffer *b, const char *name)
{
static const char *staging_dir = "drivers/staging";
@@ -2492,6 +2500,7 @@ int main(int argc, char **argv)
add_header(&buf, mod);
add_intree_flag(&buf, !external_module);
+ add_retpoline(&buf);
add_staging_flag(&buf, mod->name);
err |= add_versions(&buf, mod);
add_depends(&buf, mod, modules);
--
2.14.2

View file

@ -1,127 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 18 Jan 2018 16:14:26 -0500
Subject: [PATCH] net: tcp: close sock if net namespace is exiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When a tcp socket is closed, if it detects that its net namespace is
exiting, close immediately and do not wait for FIN sequence.
For normal sockets, a reference is taken to their net namespace, so it will
never exit while the socket is open. However, kernel sockets do not take a
reference to their net namespace, so it may begin exiting while the kernel
socket is still open. In this case if the kernel socket is a tcp socket,
it will stay open trying to complete its close sequence. The sock's dst(s)
hold a reference to their interface, which are all transferred to the
namespace's loopback interface when the real interfaces are taken down.
When the namespace tries to take down its loopback interface, it hangs
waiting for all references to the loopback interface to release, which
results in messages like:
unregister_netdevice: waiting for lo to become free. Usage count = 1
These messages continue until the socket finally times out and closes.
Since the net namespace cleanup holds the net_mutex while calling its
registered pernet callbacks, any new net namespace initialization is
blocked until the current net namespace finishes exiting.
After this change, the tcp socket notices the exiting net namespace, and
closes immediately, releasing its dst(s) and their reference to the
loopback interface, which lets the net namespace continue exiting.
Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
Signed-off-by: Dan Streetman <ddstreet@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
include/net/net_namespace.h | 10 ++++++++++
net/ipv4/tcp.c | 3 +++
net/ipv4/tcp_timer.c | 15 +++++++++++++++
3 files changed, 28 insertions(+)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 1c401bd4c2e0..a5d023fa78db 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
return net1 == net2;
}
+static inline int check_net(const struct net *net)
+{
+ return atomic_read(&net->count) != 0;
+}
+
void net_drop_ns(void *);
#else
@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
return 1;
}
+static inline int check_net(const struct net *net)
+{
+ return 1;
+}
+
#define net_drop_ns NULL
#endif
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a3e91b552edc..fd2a086da910 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
tcp_send_active_reset(sk, GFP_ATOMIC);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
+ } else if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
+ tcp_set_state(sk, TCP_CLOSE);
}
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e906014890b6..ec1e5de41653 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
+ * Also close if our net namespace is exiting; in that case there is no
+ * hope of ever communicating again since all netns interfaces are already
+ * down (or about to be down), and we need to release our dst references,
+ * which have been moved to the netns loopback interface, so the namespace
+ * can finish exiting. This condition is only possible if we are a kernel
+ * socket, as those do not hold references to the namespace.
+ *
* Criteria is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
* 2. If we have strong memory pressure.
+ * 3. If our net namespace is exiting.
*/
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
+
+ if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
+ tcp_done(sk);
+ return 1;
+ }
+
return 0;
}
--
2.14.2

View file

@ -1,46 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 2 Nov 2017 13:03:42 +0300
Subject: [PATCH] lockd: lost rollback of set_grace_period() in
lockd_down_net()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Commit efda760fe95ea ("lockd: fix lockd shutdown race") is incorrect,
it removes lockd_manager and disarm grace_period_end for init_net only.
If nfsd was started from another net namespace lockd_up_net() calls
set_grace_period() that adds lockd_manager into per-netns list
and queues grace_period_end delayed work.
These action should be reverted in lockd_down_net().
Otherwise it can lead to double list_add on after restart nfsd in netns,
and to use-after-free if non-disarmed delayed work will be executed after netns destroy.
Fixes: efda760fe95e ("lockd: fix lockd shutdown race")
Cc: stable@vger.kernel.org
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
(cherry picked from commit 3a2b19d1ee5633f76ae8a88da7bc039a5d1732aa)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
fs/lockd/svc.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 726b6cecf430..fa8f6effcf00 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -274,6 +274,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
if (ln->nlmsvc_users) {
if (--ln->nlmsvc_users == 0) {
nlm_shutdown_hosts_net(net);
+ cancel_delayed_work_sync(&ln->grace_period_end);
+ locks_end_grace(&ln->lockd_manager);
svc_shutdown_net(serv, net);
dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
}
--
2.14.2