libceph: fix messenger retry
In ancient times, the messenger could both initiate and accept connections. An artifact if that was data structures to store/process an incoming ceph_msg_connect request and send an outgoing ceph_msg_connect_reply. Sadly, the negotiation code was referencing those structures and ignoring important information (like the peer's connect_seq) from the correct ones. Among other things, this fixes tight reconnect loops where the server sends RETRY_SESSION and we (the client) retries with the same connect_seq as last time. This bug pretty easily triggered by injecting socket failures on the MDS and running some fs workload like workunits/direct_io/test_sync_io. Signed-off-by: Sage Weil <sage@inktank.com>
This commit is contained in:
parent
cd43045c2d
commit
a16cb1f707
2 changed files with 8 additions and 16 deletions
|
|
@ -172,16 +172,8 @@ struct ceph_connection {
|
|||
|
||||
/* connection negotiation temps */
|
||||
char in_banner[CEPH_BANNER_MAX_LEN];
|
||||
union {
|
||||
struct { /* outgoing connection */
|
||||
struct ceph_msg_connect out_connect;
|
||||
struct ceph_msg_connect_reply in_reply;
|
||||
};
|
||||
struct { /* incoming */
|
||||
struct ceph_msg_connect in_connect;
|
||||
struct ceph_msg_connect_reply out_reply;
|
||||
};
|
||||
};
|
||||
struct ceph_msg_connect out_connect;
|
||||
struct ceph_msg_connect_reply in_reply;
|
||||
struct ceph_entity_addr actual_peer_addr;
|
||||
|
||||
/* message out temps */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue