Race condition / TOCTOU in MPTCP read path (potential use-after-free or memory corruption)
Description
The commit contains multiple MPTCP fixes. The security-relevant portion is a race/TOCTOU hazard in the MPTCP read path (read_sock) where data/descriptor state could become inconsistent under concurrent reads, potentially allowing memory-safety issues (e.g., use-after-free or invalid access). Patch 9 specifically adds a check in the read loop: after consuming a received skb, if the read descriptor count (desc->count) is zero, the loop breaks, ensuring the code does not continue processing with an exhausted descriptor. This helps prevent a window where the kernel could operate on stale or freed data while another thread/process is draining data. The rest of the changes address related TOCTOU/race vectors and memory safety (e.g., rcv_wnd computation in DSS, cleared flags, and safer subflow signaling), collectively reducing security risk. Overall, this is a genuine vulnerability fix for a race condition in the MPTCP read path with MEDIUM confidence.
Commit Details
Author: Jakub Kicinski
Date: 2026-06-04 02:04 UTC
Message:
Merge branch 'mptcp-misc-fixes-for-v7-1-rc7'
Matthieu Baerts says:
====================
mptcp: misc fixes for v7.1-rc7
Here are various unrelated fixes:
- Patch 1: fix missing wakeups when multiple threads are reading from
the same fd. A fix for v5.7.
- Patch 2: fix retransmission loop when MPTCP checksum is enabled. A fix
for v5.14.
- Patch 3: fix a TOCTOU race while computing rcv_wnd. A fix for v5.11.
- Patch 4: allow subflows receive window to shrink if needed. A fix for
v5.19.
- Patches 5-6: avoid 'extra_subflows' to underflow with the userspace
PM. A fix for v5.19.
- Patch 7: report errors if one subflow cannot set SO_TIMESTAMPING. A
fix for v5.14.
- Patch 8: try to set TCP_MAXSEG on all subflows, before reporting
errors, if any. A fix for v6.17.
- Patch 9: check desc->count in read_sock, to act as expected. A fix
for v7.0.
- Patch 10: fix an uninit value in mptcp_established_options, reported
by syzbot. A fix for v7.1-rc1.
- Patch 11: fix a similar issue than the previous patch, exposed by the
same modification from v7.1-rc1, but was already causing issues since
v5.15.
====================
Link: https://patch.msgid.link/20260602-net-mptcp-misc-fixes-7-1-rc7-v2-0-856831229976@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Triage Assessment
Vulnerability Type: Race condition
Confidence: MEDIUM
Reasoning:
The patch set includes fixes for TOCTOU race conditions and an uninitialized value related to established options, both of which can have security implications (timing/race exploits, memory safety). While the commit is a collection of various MPTCP stabilize fixes, several changes address potential security-relevant races and undefined behavior.
Verification Assessment
Vulnerability Type: Race condition / TOCTOU in MPTCP read path (potential use-after-free or memory corruption)
Confidence: MEDIUM
Affected Versions: v7.0-rc6 through v7.1-rc6 (inclusive); fixed in v7.1-rc7
Code Diff
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index f7263fe2a2e40b5..ee70f597a4de8fb 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -27,7 +27,9 @@ struct mptcp_ext {
u32 subflow_seq;
u16 data_len;
__sum16 csum;
- u8 use_map:1,
+
+ struct_group(flags,
+ u8 use_map:1,
dsn64:1,
data_fin:1,
use_ack:1,
@@ -35,9 +37,10 @@ struct mptcp_ext {
mpc_map:1,
frozen:1,
reset_transient:1;
- u8 reset_reason:4,
+ u8 reset_reason:4,
csum_reqd:1,
infinite_map:1;
+ ); /* end of flags group */
};
#define MPTCPOPT_HMAC_LEN 20
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 8a1c5698983cff3..b3ea7854818fde8 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -566,12 +566,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct tcp_sock *tp = tcp_sk(sk);
unsigned int dss_size = 0;
struct mptcp_ext *mpext;
unsigned int ack_size;
bool ret = false;
- u64 ack_seq;
+ /* Zero `use_ack` and `use_map` flags with one shot. */
+ BUILD_BUG_ON(sizeof_field(struct mptcp_ext, flags) != sizeof(u16));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_ext, flags),
+ sizeof(u16)));
+ *(u16 *)&opts->ext_copy.flags = 0;
opts->csum_reqd = READ_ONCE(msk->csum_enabled);
mpext = skb ? mptcp_get_ext(skb) : NULL;
@@ -595,20 +600,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
/* passive sockets msk will set the 'can_ack' after accept(), even
* if the first subflow may have the already the remote key handy
*/
- opts->ext_copy.use_ack = 0;
if (!READ_ONCE(msk->can_ack)) {
*size = ALIGN(dss_size, 4);
return ret;
}
- ack_seq = READ_ONCE(msk->ack_seq);
if (READ_ONCE(msk->use_64bit_ack)) {
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
- opts->ext_copy.data_ack = ack_seq;
opts->ext_copy.ack64 = 1;
} else {
ack_size = TCPOLEN_MPTCP_DSS_ACK32;
- opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
opts->ext_copy.ack64 = 0;
}
opts->ext_copy.use_ack = 1;
@@ -618,6 +619,12 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
if (dss_size == 0)
ack_size += TCPOLEN_MPTCP_DSS_BASE;
+ /* The caller is __tcp_transmit_skb(), and will compute the new rcv
+ * wnd soon: ensure that the window can shrink.
+ */
+ if (skb)
+ tp->rcv_wnd = tp->rcv_nxt - tp->rcv_wup;
+
dss_size += ack_size;
*size = ALIGN(dss_size, 4);
@@ -658,7 +665,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- bool drop_other_suboptions = false;
unsigned int opt_size = *size;
struct mptcp_addr_info addr;
bool echo;
@@ -669,36 +675,20 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
*/
if (!mptcp_pm_should_add_signal(msk) ||
(opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
- !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr,
- &echo, &drop_other_suboptions))
+ !skb || !skb_is_tcp_pure_ack(skb) ||
+ !mptcp_pm_add_addr_signal(msk, opt_size, remaining, &addr, &echo))
return false;
- /*
- * Later on, mptcp_write_options() will enforce mutually exclusion with
- * DSS, bail out if such option is set and we can't drop it.
- */
- if (drop_other_suboptions)
- remaining += opt_size;
- else if (opts->suboptions & OPTION_MPTCP_DSS)
- return false;
+ remaining += opt_size;
len = mptcp_add_addr_len(addr.family, echo, !!addr.port);
if (remaining < len)
return false;
*size = len;
- if (drop_other_suboptions) {
- pr_debug("drop other suboptions\n");
- opts->suboptions = 0;
-
- /* note that e.g. DSS could have written into the memory
- * aliased by ahmac, we must reset the field here
- * to avoid appending the hmac even for ADD_ADDR echo
- * options
- */
- opts->ahmac = 0;
- *size -= opt_size;
- }
+ pr_debug("drop other suboptions\n");
+ opts->suboptions = 0;
+ *size -= opt_size;
opts->addr = addr;
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
if (!echo) {
@@ -708,6 +698,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
&opts->addr);
} else {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
+ opts->ahmac = 0;
}
pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n",
opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port));
@@ -1297,19 +1288,14 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return true;
}
-static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
+static u64 mptcp_set_rwin(struct mptcp_sock *msk, struct tcp_sock *tp,
+ struct tcphdr *th, u64 ack_seq)
{
const struct sock *ssk = (const struct sock *)tp;
- struct mptcp_subflow_context *subflow;
- u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
- struct mptcp_sock *msk;
+ u64 rcv_wnd_old, rcv_wnd_new;
u32 new_win;
u64 win;
- subflow = mptcp_subflow_ctx(ssk);
- msk = mptcp_sk(subflow->conn);
-
- ack_seq = READ_ONCE(msk->ack_seq);
rcv_wnd_new = ack_seq + tp->rcv_wnd;
rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
@@ -1362,7 +1348,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
update_wspace:
WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
- subflow->rcv_wnd_sent = rcv_wnd_new;
+ return rcv_wnd_new;
}
static void mptcp_track_rwin(struct tcp_sock *tp)
@@ -1474,13 +1460,25 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
if (mpext->use_ack) {
+ struct mptcp_sock *msk;
+ u64 ack_seq;
+
+ /* DSS option is set only by mptcp_established_options,
+ * the caller is __tcp_transmit_skb() and ssk is always
+ * not NULL.
+ */
+ subflow = mptcp_subflow_ctx(ssk);
+ msk = mptcp_sk(subflow->conn);
+ ack_seq = READ_ONCE(msk->ack_seq);
if (mpext->ack64) {
- put_unaligned_be64(mpext->data_ack, ptr);
+ put_unaligned_be64(ack_seq, ptr);
ptr += 2;
} else {
- put_unaligned_be32(mpext->data_ack32, ptr);
+ put_unaligned_be32(ack_seq, ptr);
ptr += 1;
}
+ subflow->rcv_wnd_sent = mptcp_set_rwin(msk, tp, th,
+ ack_seq);
}
if (mpext->use_map) {
@@ -1708,9 +1706,6 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
i += 4;
}
}
-
- if (tp)
- mptcp_set_rwin(tp, th);
}
__be32 mptcp_get_reset_option(const struct sk_buff *skb)
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 3e770c7407e1fd3..470501470fe5430 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -887,10 +887,9 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
}
}
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
- unsigned int opt_size, unsigned int remaining,
- struct mptcp_addr_info *addr, bool *echo,
- bool *drop_other_suboptions)
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int opt_size,
+ unsigned int remaining,
+ struct mptcp_addr_info *addr, bool *echo)
{
bool skip_add_addr = false;
int ret = false;
@@ -908,10 +907,7 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
* plain dup-ack from TCP perspective. The other MPTCP-relevant info,
* if any, will be carried by the 'original' TCP ack
*/
- if (skb && skb_is_tcp_pure_ack(skb)) {
- remaining += opt_size;
- *drop_other_suboptions = true;
- }
+ remaining += opt_size;
*echo = mptcp_pm_should_add_signal_echo(msk);
if (*echo) {
@@ -929,9 +925,6 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
if (remaining < mptcp_add_addr_len(family, *echo, port)) {
struct net *net = sock_net((struct sock *)msk);
- if (!*drop_other_suboptions)
- goto out_unlock;
-
if (*echo) {
MPTCP_INC_STATS(net, MPTCP_MIB_ECHOADDTXDROP);
} else {
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index 8cbc1920afb492a..0d3a95e676f17d0 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -408,19 +408,21 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
local.flags = entry.flags;
local.ifindex = entry.ifindex;
+ spin_lock_bh(&msk->pm.lock);
+ msk->pm.extra_subflows++;
+ spin_unlock_bh(&msk->pm.lock);
+
lock_sock(sk);
err = __mptcp_subflow_connect(sk, &local, &addr_r);
release_sock(sk);
- if (err)
+ if (err) {
GENL_SET_ERR_MSG_FMT(info, "connect error: %d", err);
- spin_lock_bh(&msk->pm.lock);
- if (err)
+ spin_lock_bh(&msk->pm.lock);
mptcp_userspace_pm_delete_local_addr(msk, &entry);
- else
- msk->pm.extra_subflows++;
- spin_unlock_bh(&msk->pm.lock);
+ spin_unlock_bh(&msk->pm.lock);
+ }
create_err:
sock_put(sk);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index a72a6ad6ee8b1da..cb9515f505aa4e0 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2276,6 +2276,10 @@ static bool mptcp_move_skbs(struct sock *sk)
mptcp_backlog_spooled(sk, moved, &skbs);
}
mptcp_data_unlock(sk);
+
+ if (enqueued && mptcp_epollin_ready(sk))
+ sk->sk_data_ready(sk);
+
return enqueued;
}
@@ -2865,6 +2869,10 @@ static void __mptcp_retrans(struct sock *sk)
msk->bytes_retrans += len;
dfrag->already_sent = max(dfrag->already_sent, len);
+ /* With csum enabled retransmission can send new data. */
+ if (after64(dfrag->already_sent + dfrag->data_seq, msk->snd_nxt))
+ WRITE_ONCE(msk->snd_nxt, dfrag->already_sent + dfrag->data_seq);
+
reset_timer:
mptcp_check_and_set_pending(sk);
@@ -4420,6 +4428,8 @@ static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
}
mptcp_eat_recv_skb(sk, skb);
+ if (!desc->count)
+ break;
}
if (noack)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e4f5aba24da7db9..b93b878478d2620 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -1229,10 +1229,9 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1;
}
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
- unsigned int opt_size, unsigned int remaining,
- struct mptcp_addr_info *addr, bool *echo,
- bool *drop_other_suboptions);
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int opt_size,
+ unsigned int remaining,
+ struct mptcp_addr_info *addr, bool *echo);
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 87b5796d0135342..fcf6feb2a9eb5db 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -241,15 +241,19 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int err;
lock_sock(ssk);
- sock_set_timestamping(ssk, optname, timestamping);
+ err = sock_set_timestamping(ssk, optname, timestamping);
release_sock(ssk);
+
+ if (err < 0 && ret == 0)
+ ret = err;
}
release_sock(sk);
- return 0;
+ return ret;
}
static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
@@ -813,10 +817,11 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int err;
- ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
- if (ret)
- break;
+ err = tcp_setsockopt(ssk, level, optname, optval, optlen);
+ if (err < 0 && ret == 0)
+ ret = err;
}
if (!ret)
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 5acd12021e6e8b1..4b3f71e666092ea 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -4100,6 +4100,10 @@ userspace_tests()
chk_rm_nr 0 1
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
+ # check counters are not affected by errors at creation time
+ userspace_pm_add_sf $ns2 10.0.12.2 10 2>/dev/null
+ chk_mptcp_info subflows 0 subflows 0
+ chk_subflows_total 1 1
kill_events_pids
mptcp_lib_kill_group_wait $tests_pid
fi