diff options
author | Priya Krishnan <Priya.Krishnan@Sun.COM> | 2010-05-10 17:03:07 -0400 |
---|---|---|
committer | Priya Krishnan <Priya.Krishnan@Sun.COM> | 2010-05-10 17:03:07 -0400 |
commit | d618d68dcf9c6c8f2c1e2fbbd4de1de0cf30150e (patch) | |
tree | 0e469c9dcd6535ebc3340e37ebe133d4a6a8d2d8 | |
parent | 71ed50cf049ab14d8e0ef8d48ba17d91223e81e7 (diff) | |
download | illumos-gate-d618d68dcf9c6c8f2c1e2fbbd4de1de0cf30150e.tar.gz |
6890586 RFE: Support iSCSI Multiple Connections per Session (MC/S) with COMSTAR iSCSI
-rw-r--r-- | usr/src/uts/common/io/comstar/port/iscsit/iscsit.c | 531 | ||||
-rw-r--r-- | usr/src/uts/common/io/comstar/port/iscsit/iscsit.h | 36 | ||||
-rw-r--r-- | usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c | 10 | ||||
-rw-r--r-- | usr/src/uts/common/io/idm/idm.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/io/idm/idm_conn_sm.c | 6 | ||||
-rw-r--r-- | usr/src/uts/common/sys/idm/idm_impl.h | 6 |
6 files changed, 534 insertions, 59 deletions
diff --git a/usr/src/uts/common/io/comstar/port/iscsit/iscsit.c b/usr/src/uts/common/io/comstar/port/iscsit/iscsit.c index e68a50291d..a53ab0755a 100644 --- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit.c +++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/cpuvar.h> @@ -120,6 +119,30 @@ static idm_status_t iscsit_init(dev_info_t *dip); static idm_status_t iscsit_enable_svc(iscsit_hostinfo_t *hostinfo); static void iscsit_disable_svc(void); +static int +iscsit_check_cmdsn_and_queue(idm_pdu_t *rx_pdu); + +static void +iscsit_add_pdu_to_queue(iscsit_sess_t *ist, idm_pdu_t *rx_pdu); + +static idm_pdu_t * +iscsit_remove_pdu_from_queue(iscsit_sess_t *ist, uint32_t cmdsn); + +static void +iscsit_process_pdu_in_queue(iscsit_sess_t *ist); + +static void +iscsit_rxpdu_queue_monitor_session(iscsit_sess_t *ist); + +static void +iscsit_rxpdu_queue_monitor(void *arg); + +static void +iscsit_post_staged_pdu(idm_pdu_t *rx_pdu); + +static void +iscsit_post_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu); + static void iscsit_op_scsi_task_mgmt(iscsit_conn_t *ict, idm_pdu_t *rx_pdu); @@ -137,6 +160,9 @@ iscsit_pdu_op_logout_cmd(iscsit_conn_t *ict, idm_pdu_t *rx_pdu); int iscsit_cmd_window(); +static int +iscsit_sna_lt(uint32_t sn1, uint32_t sn2); + void iscsit_set_cmdsn(iscsit_conn_t *ict, idm_pdu_t *rx_pdu); @@ -216,6 +242,22 @@ static void iscsit_send_direct_scsi_resp(iscsit_conn_t *ict, idm_pdu_t *rx_pdu, static void iscsit_send_task_mgmt_resp(idm_pdu_t *tm_resp_pdu, uint8_t tm_status); +/* + * MC/S: Out-of-order commands are staged on a session-wide wait + * queue until a system-tunable threshold is reached. 
A separate + * thread is used to scan the staging queue on all the sessions. + * If a delayed PDU does not arrive within a timeout, the target + * will advance to the staged PDU that is next in sequence, skipping + * over the missing PDU(s) to go past a hole in the sequence. + */ +volatile int rxpdu_queue_threshold = ISCSIT_RXPDU_QUEUE_THRESHOLD; + +static kmutex_t iscsit_rxpdu_queue_monitor_mutex; +kthread_t *iscsit_rxpdu_queue_monitor_thr_id; +static kt_did_t iscsit_rxpdu_queue_monitor_thr_did; +static boolean_t iscsit_rxpdu_queue_monitor_thr_running; +static kcondvar_t iscsit_rxpdu_queue_monitor_cv; + int _init(void) { @@ -226,6 +268,12 @@ _init(void) MUTEX_DRIVER, NULL); iscsit_global.global_svc_state = ISE_DETACHED; + mutex_init(&iscsit_rxpdu_queue_monitor_mutex, NULL, + MUTEX_DRIVER, NULL); + iscsit_rxpdu_queue_monitor_thr_id = NULL; + iscsit_rxpdu_queue_monitor_thr_running = B_FALSE; + cv_init(&iscsit_rxpdu_queue_monitor_cv, NULL, CV_DEFAULT, NULL); + if ((rc = mod_install(&modlinkage)) != 0) { mutex_destroy(&iscsit_global.global_state_mutex); rw_destroy(&iscsit_global.global_rwlock); @@ -249,6 +297,8 @@ _fini(void) rc = mod_remove(&modlinkage); if (rc == 0) { + mutex_destroy(&iscsit_rxpdu_queue_monitor_mutex); + cv_destroy(&iscsit_rxpdu_queue_monitor_cv); mutex_destroy(&iscsit_global.global_state_mutex); rw_destroy(&iscsit_global.global_rwlock); } @@ -692,6 +742,9 @@ iscsit_enable_svc(iscsit_hostinfo_t *hostinfo) iscsit_global.global_dispatch_taskq = taskq_create("iscsit_dispatch", 1, minclsyspri, 16, 16, TASKQ_PREPOPULATE); + /* Scan staged PDUs, meaningful in MC/S situations */ + iscsit_rxpdu_queue_monitor_start(); + return (IDM_STATUS_SUCCESS); tear_down_and_return: @@ -753,6 +806,8 @@ iscsit_disable_svc(void) ASSERT(iscsit_global.global_svc_state == ISE_DISABLING); + iscsit_rxpdu_queue_monitor_stop(); + /* tear down discovery sessions */ for (sess = avl_first(&iscsit_global.global_discovery_sessions); sess != NULL; @@ -853,8 +908,10 @@ 
iscsit_rx_pdu(idm_conn_t *ic, idm_pdu_t *rx_pdu) idm_conn_event(ic, CE_TRANSPORT_FAIL, NULL); break; case ISCSI_OP_SCSI_TASK_MGT_MSG: - iscsit_set_cmdsn(ict, rx_pdu); - iscsit_op_scsi_task_mgmt(ict, rx_pdu); + if (iscsit_check_cmdsn_and_queue(rx_pdu)) { + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_op_scsi_task_mgmt(ict, rx_pdu); + } break; case ISCSI_OP_NOOP_OUT: case ISCSI_OP_LOGIN_CMD: @@ -1034,10 +1091,9 @@ iscsit_build_hdr(idm_task_t *idm_task, idm_pdu_t *pdu, uint8_t opcode) iscsi_data_rsp_hdr_t *dh = (iscsi_data_rsp_hdr_t *)pdu->isp_hdr; /* - * We acquired iscsit_sess_t.ist_sn_rwlock in iscsit_xfer_scsi_data - * in reader mode so we expect to be locked here + * We acquired iscsit_sess_t.ist_sn_mutex in iscsit_xfer_scsi_data */ - + ASSERT(MUTEX_HELD(&itask->it_ict->ict_sess->ist_sn_mutex)); /* * Lun is only required if the opcode == ISCSI_OP_SCSI_DATA_RSP * and the 'A' bit is to be set @@ -1252,12 +1308,37 @@ iscsit_ffp_disabled(idm_conn_t *ic, idm_ffp_disable_t disable_class) static idm_status_t iscsit_conn_lost(idm_conn_t *ic) { - iscsit_conn_t *ict = ic->ic_handle; + iscsit_conn_t *ict = ic->ic_handle; + iscsit_sess_t *ist = ict->ict_sess; + iscsit_cbuf_t *cbuf; + idm_pdu_t *rx_pdu; + int i; mutex_enter(&ict->ict_mutex); ict->ict_lost = B_TRUE; mutex_exit(&ict->ict_mutex); - + /* + * scrub the staging queue for all PDUs on this connection + */ + if (ist != NULL) { + mutex_enter(&ist->ist_sn_mutex); + for (cbuf = ist->ist_rxpdu_queue, i = 0; + ((cbuf->cb_num_elems > 0) && (i < ISCSIT_RXPDU_QUEUE_LEN)); + i++) { + if (((rx_pdu = cbuf->cb_buffer[i]) != NULL) && + (rx_pdu->isp_ic == ic)) { + /* conn is lost, drop the pdu */ + DTRACE_PROBE3(scrubbing__staging__queue, + iscsit_sess_t *, ist, idm_conn_t *, ic, + idm_pdu_t *, rx_pdu); + idm_pdu_complete(rx_pdu, IDM_STATUS_FAIL); + cbuf->cb_buffer[i] = NULL; + cbuf->cb_num_elems--; + iscsit_conn_dispatch_rele(ict); + } + } + mutex_exit(&ist->ist_sn_mutex); + } /* * Make sure there aren't any PDU's transitioning from 
the receive * handler to the dispatch taskq. @@ -1431,20 +1512,20 @@ iscsit_xfer_scsi_data(scsi_task_t *task, stmf_data_buf_t *dbuf, * access to the SN values. We need to lock here to enforce * lock ordering */ - rw_enter(&ict_sess->ist_sn_rwlock, RW_READER); + mutex_enter(&ict_sess->ist_sn_mutex); idm_rc = idm_buf_tx_to_ini(iscsit_task->it_idm_task, ibuf->ibuf_idm_buf, dbuf->db_relative_offset, dbuf->db_data_size, &iscsit_buf_xfer_cb, dbuf); - rw_exit(&ict_sess->ist_sn_rwlock); + mutex_exit(&ict_sess->ist_sn_mutex); return (iscsit_idm_to_stmf(idm_rc)); } else if (dbuf->db_flags & DB_DIRECTION_FROM_RPORT) { /* Grab the SN lock (see comment above) */ - rw_enter(&ict_sess->ist_sn_rwlock, RW_READER); + mutex_enter(&ict_sess->ist_sn_mutex); idm_rc = idm_buf_rx_from_ini(iscsit_task->it_idm_task, ibuf->ibuf_idm_buf, dbuf->db_relative_offset, dbuf->db_data_size, &iscsit_buf_xfer_cb, dbuf); - rw_exit(&ict_sess->ist_sn_rwlock); + mutex_exit(&ict_sess->ist_sn_mutex); return (iscsit_idm_to_stmf(idm_rc)); } @@ -1821,13 +1902,23 @@ iscsit_idm_to_stmf(idm_status_t idmrc) /*NOTREACHED*/ } +void +iscsit_op_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu) +{ + iscsit_conn_t *ict = ic->ic_handle; + + if (iscsit_check_cmdsn_and_queue(rx_pdu)) { + iscsit_post_scsi_cmd(ic, rx_pdu); + } + iscsit_process_pdu_in_queue(ict->ict_sess); +} /* * ISCSI protocol */ void -iscsit_op_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu) +iscsit_post_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu) { iscsit_conn_t *ict; iscsit_task_t *itask; @@ -1851,7 +1942,6 @@ iscsit_op_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu) return; } - /* * Note CmdSN and ITT in task. 
IDM will have already validated this * request against the connection state so we don't need to check @@ -2038,7 +2128,6 @@ iscsit_op_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu) uint32_t, ibuf->ibuf_stmf_buf->db_relative_offset, uint64_t, 0, uint32_t, 0, uint32_t, 0, /* no raddr */ uint32_t, rx_pdu->isp_datalen, int, XFER_BUF_TX_TO_INI); - stmf_post_task(task, ibuf->ibuf_stmf_buf); } else { @@ -2085,25 +2174,39 @@ iscsit_deferred_dispatch(idm_pdu_t *rx_pdu) static void iscsit_deferred(void *rx_pdu_void) { - idm_pdu_t *rx_pdu = rx_pdu_void; - idm_conn_t *ic = rx_pdu->isp_ic; - iscsit_conn_t *ict = ic->ic_handle; + idm_pdu_t *rx_pdu = rx_pdu_void; + idm_conn_t *ic = rx_pdu->isp_ic; + iscsit_conn_t *ict = ic->ic_handle; + /* + * NOP and Task Management Commands can be marked for immediate + * delivery. Commands marked as 'Immediate' are to be considered + * for execution as soon as they arrive on the target. So these + * should not be checked for sequence order and put in a queue. + * The CmdSN is not advanced for Immediate Commands. + */ switch (IDM_PDU_OPCODE(rx_pdu)) { case ISCSI_OP_NOOP_OUT: - iscsit_set_cmdsn(ict, rx_pdu); - iscsit_pdu_op_noop(ict, rx_pdu); + if (iscsit_check_cmdsn_and_queue(rx_pdu)) { + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_pdu_op_noop(ict, rx_pdu); + } break; case ISCSI_OP_LOGIN_CMD: iscsit_pdu_op_login_cmd(ict, rx_pdu); - break; + iscsit_conn_dispatch_rele(ict); + return; case ISCSI_OP_TEXT_CMD: - iscsit_set_cmdsn(ict, rx_pdu); - iscsit_pdu_op_text_cmd(ict, rx_pdu); + if (iscsit_check_cmdsn_and_queue(rx_pdu)) { + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_pdu_op_text_cmd(ict, rx_pdu); + } break; case ISCSI_OP_LOGOUT_CMD: - iscsit_set_cmdsn(ict, rx_pdu); - iscsit_pdu_op_logout_cmd(ict, rx_pdu); + if (iscsit_check_cmdsn_and_queue(rx_pdu)) { + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_pdu_op_logout_cmd(ict, rx_pdu); + } break; default: /* Protocol error. 
IDM should have caught this */ @@ -2111,6 +2214,11 @@ iscsit_deferred(void *rx_pdu_void) ASSERT(0); break; } + /* + * Check if there are other PDUs in the session staging queue + * waiting to be posted to SCSI layer. + */ + iscsit_process_pdu_in_queue(ict->ict_sess); iscsit_conn_dispatch_rele(ict); } @@ -2240,14 +2348,20 @@ iscsit_op_scsi_task_mgmt(iscsit_conn_t *ict, idm_pdu_t *rx_pdu) refcmdsn = ntohl(iscsi_tm->refcmdsn); /* - * Task was not found. If RefCmdSN is within the CmdSN - * window and less than CmdSN of the TM function, return - * "Function Complete". Otherwise, return - * "Task Does Not Exist". + * Task was not found. But the SCSI command could be + * on the rxpdu wait queue. If RefCmdSN is within + * the CmdSN window and less than CmdSN of the TM + * function, return "Function Complete". Otherwise, + * return "Task Does Not Exist". */ if (iscsit_cmdsn_in_window(ict, refcmdsn) && - (refcmdsn < cmdsn)) { + iscsit_sna_lt(refcmdsn, cmdsn)) { + mutex_enter(&ict->ict_sess->ist_sn_mutex); + (void) iscsit_remove_pdu_from_queue( + ict->ict_sess, refcmdsn); + iscsit_conn_dispatch_rele(ict); + mutex_exit(&ict->ict_sess->ist_sn_mutex); iscsit_send_task_mgmt_resp(tm_resp_pdu, SCSI_TCP_TM_RESP_COMPLETE); } else { @@ -2473,8 +2587,12 @@ iscsit_pdu_op_logout_cmd(iscsit_conn_t *ict, idm_pdu_t *rx_pdu) int iscsit_cmd_window() { - /* Will be better later */ - return (1024); + /* + * Instead of using a pre-defined constant for the command window, + * it should be made configurable and dynamic. With MC/S, sequence + * numbers will be used up at a much faster rate than with SC/S. 
+ */ + return (ISCSIT_MAX_WINDOW); } /* @@ -2489,11 +2607,16 @@ iscsit_set_cmdsn(iscsit_conn_t *ict, idm_pdu_t *rx_pdu) ist = ict->ict_sess; req = (iscsi_scsi_cmd_hdr_t *)rx_pdu->isp_hdr; + if (req->opcode & ISCSI_OP_IMMEDIATE) { + /* no cmdsn increment for immediate PDUs */ + return; + } - rw_enter(&ist->ist_sn_rwlock, RW_WRITER); + /* Ensure that the ExpCmdSN advances in an orderly manner */ + mutex_enter(&ist->ist_sn_mutex); ist->ist_expcmdsn = ntohl(req->cmdsn) + 1; ist->ist_maxcmdsn = ntohl(req->cmdsn) + iscsit_cmd_window(); - rw_exit(&ist->ist_sn_rwlock); + mutex_exit(&ist->ist_sn_mutex); } /* @@ -2509,16 +2632,16 @@ iscsit_pdu_tx(idm_pdu_t *pdu) /* * The command sequence numbers are session-wide and must stay * consistent across the transfer, so protect the cmdsn with a - * reader lock on the session. The status sequence number will + * mutex lock on the session. The status sequence number will * be updated just before the transport layer transmits the PDU. */ - rw_enter(&ict->ict_sess->ist_sn_rwlock, RW_READER); + mutex_enter(&ict->ict_sess->ist_sn_mutex); /* Set ExpCmdSN and MaxCmdSN */ rsp->maxcmdsn = htonl(ist->ist_maxcmdsn); rsp->expcmdsn = htonl(ist->ist_expcmdsn); idm_pdu_tx(pdu); - rw_exit(&ict->ict_sess->ist_sn_rwlock); + mutex_exit(&ict->ict_sess->ist_sn_mutex); } /* @@ -2916,7 +3039,7 @@ iscsit_cmdsn_in_window(iscsit_conn_t *ict, uint32_t cmdsn) ist = ict->ict_sess; - rw_enter(&ist->ist_sn_rwlock, RW_READER); + mutex_enter(&ist->ist_sn_mutex); /* * If cmdsn is less than ist_expcmdsn - iscsit_cmd_window() or @@ -2928,7 +3051,335 @@ iscsit_cmdsn_in_window(iscsit_conn_t *ict, uint32_t cmdsn) rval = B_FALSE; } - rw_exit(&ist->ist_sn_rwlock); + mutex_exit(&ist->ist_sn_mutex); return (rval); } + +/* + * iscsit_check_cmdsn_and_queue + * + * Independent of the order in which the iSCSI target receives non-immediate + * command PDU across the entire session and any multiple connections within + * the session, the target must deliver the commands to the SCSI 
layer in + * CmdSN order. So out-of-order non-immediate commands are queued up on a + * session-wide wait queue. Duplicate commands are ignored. + * + */ +static int +iscsit_check_cmdsn_and_queue(idm_pdu_t *rx_pdu) +{ + idm_conn_t *ic = rx_pdu->isp_ic; + iscsit_conn_t *ict = ic->ic_handle; + iscsit_sess_t *ist = ict->ict_sess; + iscsi_scsi_cmd_hdr_t *hdr = (iscsi_scsi_cmd_hdr_t *)rx_pdu->isp_hdr; + + mutex_enter(&ist->ist_sn_mutex); + if (hdr->opcode & ISCSI_OP_IMMEDIATE) { + /* do not queue, handle it immediately */ + DTRACE_PROBE2(immediate__cmd, iscsit_sess_t *, ist, + idm_pdu_t *, rx_pdu); + mutex_exit(&ist->ist_sn_mutex); + return (ISCSIT_CMDSN_EQ_EXPCMDSN); + } + if (iscsit_sna_lt(ist->ist_expcmdsn, ntohl(hdr->cmdsn))) { + /* + * Out-of-order commands (cmdSN higher than ExpCmdSN) + * are staged on a fixed-size circular buffer until + * the missing command is delivered to the SCSI layer. + * Irrespective of the order of insertion into the + * staging queue, the commands are processed out of the + * queue in cmdSN order only. + */ + rx_pdu->isp_queue_time = ddi_get_time(); + iscsit_add_pdu_to_queue(ist, rx_pdu); + mutex_exit(&ist->ist_sn_mutex); + return (ISCSIT_CMDSN_GT_EXPCMDSN); + } else if (iscsit_sna_lt(ntohl(hdr->cmdsn), ist->ist_expcmdsn)) { + DTRACE_PROBE3(cmdsn__lt__expcmdsn, iscsit_sess_t *, ist, + iscsit_conn_t *, ict, idm_pdu_t *, rx_pdu); + mutex_exit(&ist->ist_sn_mutex); + return (ISCSIT_CMDSN_LT_EXPCMDSN); + } else { + mutex_exit(&ist->ist_sn_mutex); + return (ISCSIT_CMDSN_EQ_EXPCMDSN); + } +} + +/* + * iscsit_add_pdu_to_queue() adds PDUs into the array indexed by + * their cmdsn value. The length of the array is kept above the + * maximum window size. The window keeps the cmdsn within a range + * such that there are no collisions, i.e. the assumption is that + * the windowing checks make it impossible to receive PDUs that + * index into the same location in the array. 
+ */ +static void +iscsit_add_pdu_to_queue(iscsit_sess_t *ist, idm_pdu_t *rx_pdu) +{ + iscsit_cbuf_t *cbuf = ist->ist_rxpdu_queue; + iscsit_conn_t *ict = rx_pdu->isp_ic->ic_handle; + uint32_t cmdsn = + ((iscsi_scsi_cmd_hdr_t *)rx_pdu->isp_hdr)->cmdsn; + uint32_t index; + + ASSERT(MUTEX_HELD(&ist->ist_sn_mutex)); + /* + * If the connection is being torn down, then + * don't add the PDU to the staging queue + */ + mutex_enter(&ict->ict_mutex); + if (ict->ict_lost) { + mutex_exit(&ict->ict_mutex); + idm_pdu_complete(rx_pdu, IDM_STATUS_FAIL); + return; + } + iscsit_conn_dispatch_hold(ict); + mutex_exit(&ict->ict_mutex); + + index = ntohl(cmdsn) % ISCSIT_RXPDU_QUEUE_LEN; + ASSERT(cbuf->cb_buffer[index] == NULL); + cbuf->cb_buffer[index] = rx_pdu; + cbuf->cb_num_elems++; +} + +static idm_pdu_t * +iscsit_remove_pdu_from_queue(iscsit_sess_t *ist, uint32_t cmdsn) +{ + iscsit_cbuf_t *cbuf = ist->ist_rxpdu_queue; + idm_pdu_t *pdu = NULL; + uint32_t index; + + ASSERT(MUTEX_HELD(&ist->ist_sn_mutex)); + index = cmdsn % ISCSIT_RXPDU_QUEUE_LEN; + if ((pdu = cbuf->cb_buffer[index]) != NULL) { + ASSERT(cmdsn == + ntohl(((iscsi_scsi_cmd_hdr_t *)pdu->isp_hdr)->cmdsn)); + cbuf->cb_buffer[index] = NULL; + cbuf->cb_num_elems--; + return (pdu); + } + return (NULL); +} + +/* + * iscsit_process_pdu_in_queue() finds the next pdu in sequence + * and posts it to the SCSI layer + */ +static void +iscsit_process_pdu_in_queue(iscsit_sess_t *ist) +{ + iscsit_cbuf_t *cbuf = ist->ist_rxpdu_queue; + idm_pdu_t *pdu = NULL; + uint32_t expcmdsn; + + for (;;) { + mutex_enter(&ist->ist_sn_mutex); + if (cbuf->cb_num_elems == 0) { + mutex_exit(&ist->ist_sn_mutex); + break; + } + expcmdsn = ist->ist_expcmdsn; + if ((pdu = iscsit_remove_pdu_from_queue(ist, expcmdsn)) + == NULL) { + mutex_exit(&ist->ist_sn_mutex); + break; + } + mutex_exit(&ist->ist_sn_mutex); + iscsit_post_staged_pdu(pdu); + } +} + +static void +iscsit_post_staged_pdu(idm_pdu_t *rx_pdu) +{ + iscsit_conn_t *ict = rx_pdu->isp_ic->ic_handle; + + 
/* Post the PDU to the SCSI layer */ + switch (IDM_PDU_OPCODE(rx_pdu)) { + case ISCSI_OP_NOOP_OUT: + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_pdu_op_noop(ict, rx_pdu); + break; + case ISCSI_OP_TEXT_CMD: + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_pdu_op_text_cmd(ict, rx_pdu); + break; + case ISCSI_OP_SCSI_TASK_MGT_MSG: + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_op_scsi_task_mgmt(ict, rx_pdu); + break; + case ISCSI_OP_SCSI_CMD: + /* cmdSN will be incremented after creating itask */ + iscsit_post_scsi_cmd(rx_pdu->isp_ic, rx_pdu); + break; + case ISCSI_OP_LOGOUT_CMD: + iscsit_set_cmdsn(ict, rx_pdu); + iscsit_pdu_op_logout_cmd(ict, rx_pdu); + break; + default: + /* No other PDUs should be placed on the queue */ + ASSERT(0); + } + iscsit_conn_dispatch_rele(ict); /* release hold on the conn */ +} + +/* ARGSUSED */ +void +iscsit_rxpdu_queue_monitor_start(void) +{ + mutex_enter(&iscsit_rxpdu_queue_monitor_mutex); + if (iscsit_rxpdu_queue_monitor_thr_running) { + mutex_exit(&iscsit_rxpdu_queue_monitor_mutex); + return; + } + iscsit_rxpdu_queue_monitor_thr_id = + thread_create(NULL, 0, iscsit_rxpdu_queue_monitor, NULL, + 0, &p0, TS_RUN, minclsyspri); + while (!iscsit_rxpdu_queue_monitor_thr_running) { + cv_wait(&iscsit_rxpdu_queue_monitor_cv, + &iscsit_rxpdu_queue_monitor_mutex); + } + mutex_exit(&iscsit_rxpdu_queue_monitor_mutex); + +} + +/* ARGSUSED */ +void +iscsit_rxpdu_queue_monitor_stop(void) +{ + mutex_enter(&iscsit_rxpdu_queue_monitor_mutex); + if (iscsit_rxpdu_queue_monitor_thr_running) { + iscsit_rxpdu_queue_monitor_thr_running = B_FALSE; + cv_signal(&iscsit_rxpdu_queue_monitor_cv); + mutex_exit(&iscsit_rxpdu_queue_monitor_mutex); + + thread_join(iscsit_rxpdu_queue_monitor_thr_did); + return; + } + mutex_exit(&iscsit_rxpdu_queue_monitor_mutex); +} + +/* + * A separate thread is used to scan the staging queue on all the + * sessions. If a delayed PDU does not arrive within a timeout, the + * target will advance to the staged PDU that is next in sequence + * and has exceeded 
the threshold wait time. It is up to the initiator + * to note that the target has not acknowledged a particular cmdsn + * and take appropriate action. + */ +/* ARGSUSED */ +static void +iscsit_rxpdu_queue_monitor(void *arg) +{ + iscsit_tgt_t *tgt; + iscsit_sess_t *ist; + + mutex_enter(&iscsit_rxpdu_queue_monitor_mutex); + iscsit_rxpdu_queue_monitor_thr_did = curthread->t_did; + iscsit_rxpdu_queue_monitor_thr_running = B_TRUE; + cv_signal(&iscsit_rxpdu_queue_monitor_cv); + + while (iscsit_rxpdu_queue_monitor_thr_running) { + ISCSIT_GLOBAL_LOCK(RW_READER); + for (tgt = avl_first(&iscsit_global.global_target_list); + tgt != NULL; + tgt = AVL_NEXT(&iscsit_global.global_target_list, tgt)) { + mutex_enter(&tgt->target_mutex); + for (ist = avl_first(&tgt->target_sess_list); + ist != NULL; + ist = AVL_NEXT(&tgt->target_sess_list, ist)) { + + iscsit_rxpdu_queue_monitor_session(ist); + } + mutex_exit(&tgt->target_mutex); + } + ISCSIT_GLOBAL_UNLOCK(); + if (iscsit_rxpdu_queue_monitor_thr_running == B_FALSE) { + break; + } + (void) cv_reltimedwait(&iscsit_rxpdu_queue_monitor_cv, + &iscsit_rxpdu_queue_monitor_mutex, + ISCSIT_RXPDU_QUEUE_MONITOR_INTERVAL * drv_usectohz(1000000), + TR_CLOCK_TICK); + } + mutex_exit(&iscsit_rxpdu_queue_monitor_mutex); + thread_exit(); +} + +static void +iscsit_rxpdu_queue_monitor_session(iscsit_sess_t *ist) +{ + iscsit_cbuf_t *cbuf = ist->ist_rxpdu_queue; + idm_pdu_t *next_pdu = NULL; + uint32_t index, next_cmdsn, i; + + /* + * Assume that all PDUs in the staging queue have a cmdsn >= expcmdsn. + * Starting with the expcmdsn, iterate over the staged PDUs to find + * the next PDU with a wait time greater than the threshold. If found + * advance the staged PDU to the SCSI layer, skipping over the missing + * PDU(s) to get past the hole in the command sequence. It is up to + * the initiator to note that the target has not acknowledged a cmdsn + * and take appropriate action. 
+ * + * Since the PDU(s) arrive in any random order, it is possible + * that the actual wait time for a particular PDU is much longer than + * the defined threshold, e.g. consider a case where commands are sent + * over 4 different connections, and cmdsn = 1004 arrives first, then + * 1003, and 1002 and 1001 are lost due to a connection failure. + * So now 1003 is waiting for 1002 to be delivered, and although the + * wait time of 1004 > wait time of 1003, only 1003 will be considered + * by the monitor thread. 1004 will be automatically processed by + * iscsit_process_pdu_in_queue() once the scan is complete and the + * expcmdsn becomes current. + */ + mutex_enter(&ist->ist_sn_mutex); + cbuf = ist->ist_rxpdu_queue; + if (cbuf->cb_num_elems == 0) { + mutex_exit(&ist->ist_sn_mutex); + return; + } + for (next_pdu = NULL, i = 0; ; i++) { + next_cmdsn = ist->ist_expcmdsn + i; /* start at expcmdsn */ + index = next_cmdsn % ISCSIT_RXPDU_QUEUE_LEN; + if ((next_pdu = cbuf->cb_buffer[index]) != NULL) { + /* + * If the PDU wait time has not exceeded threshold + * stop scanning the staging queue until the timer + * fires again + */ + if ((ddi_get_time() - next_pdu->isp_queue_time) + < rxpdu_queue_threshold) { + mutex_exit(&ist->ist_sn_mutex); + return; + } + /* + * Remove the next PDU from the queue and post it + * to the SCSI layer, skipping over the missing + * PDU. Stop scanning the staging queue until + * the monitor timer fires again + */ + (void) iscsit_remove_pdu_from_queue(ist, next_cmdsn); + mutex_exit(&ist->ist_sn_mutex); + DTRACE_PROBE3(advanced__to__blocked__cmdsn, + iscsit_sess_t *, ist, idm_pdu_t *, next_pdu, + uint32_t, next_cmdsn); + iscsit_post_staged_pdu(next_pdu); + /* Deliver any subsequent PDUs immediately */ + iscsit_process_pdu_in_queue(ist); + return; + } + /* + * Skipping over i PDUs, e.g. a case where commands 1001 and + * 1002 are lost in the network, skip over both and post 1003; + * expcmdsn then becomes 1004 at the end of the scan. 
+ */ + DTRACE_PROBE2(skipping__over__cmdsn, iscsit_sess_t *, ist, + uint32_t, next_cmdsn); + } + /* + * following the assumption, staged cmdsn >= expcmdsn, this statement + * is never reached. + */ +} diff --git a/usr/src/uts/common/io/comstar/port/iscsit/iscsit.h b/usr/src/uts/common/io/comstar/port/iscsit/iscsit.h index 0f8035cca4..1da86e4138 100644 --- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit.h +++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _ISCSIT_H_ #define _ISCSIT_H_ @@ -36,7 +35,7 @@ */ #define ISCSIT_MIN_VERSION 0x00 #define ISCSIT_MAX_VERSION 0x00 -#define ISCSIT_MAX_CONNECTIONS 1 /* No MC/S support */ +#define ISCSIT_MAX_CONNECTIONS 32 /* MC/S support */ #define ISCSIT_MAX_RECV_DATA_SEGMENT_LENGTH (32*1024) #define ISCSIT_MAX_BURST_LENGTH (512*1024) #define ISCSIT_MAX_FIRST_BURST_LENGTH ISCSI_DEFAULT_FIRST_BURST_LENGTH @@ -55,12 +54,34 @@ /* Max targets per system */ #define ISCSIT_MAX_TARGETS 1024 +#define ISCSIT_MAX_WINDOW 1024 +#define ISCSIT_RXPDU_QUEUE_LEN 2048 + +#define ISCSIT_CMDSN_LT_EXPCMDSN -1 +#define ISCSIT_CMDSN_EQ_EXPCMDSN 1 +#define ISCSIT_CMDSN_GT_EXPCMDSN 0 +/* + * MC/S: A timeout is maintained to recover from lost CmdSN (holes in the + * CmdSN ordering). When the timeout is reached, the ExpCmdSN is advanced + * past the hole to continue processing the queued commands. This value is + * system-tunable (volatile rxpdu_queue_threshold) and should be in the + * range from 5 to 30 seconds. 
+ */ +#define ISCSIT_RXPDU_QUEUE_THRESHOLD 5 /* 5 seconds */ +#define ISCSIT_RXPDU_QUEUE_MONITOR_INTERVAL 5 /* 5 seconds */ + /* Time in seconds to wait between calls to stmf_deregister_local_port */ #define TGT_DEREG_RETRY_SECONDS 1 #define ISCSIT_GLOBAL_LOCK(rw) rw_enter(&iscsit_global.global_rwlock, (rw)) #define ISCSIT_GLOBAL_UNLOCK() rw_exit(&iscsit_global.global_rwlock) +/* Circular buffer to hold the out-of-order PDUs in MC/S */ +typedef struct { + idm_pdu_t *cb_buffer[ISCSIT_RXPDU_QUEUE_LEN]; + int cb_num_elems; +} iscsit_cbuf_t; + /* * Used for serial number arithmetic (RFC 1982) */ @@ -337,7 +358,7 @@ typedef struct { iscsit_tgt_t *ist_tgt; idm_refcnt_t ist_refcnt; kmem_cache_t *ist_task_cache; - krwlock_t ist_sn_rwlock; + kmutex_t ist_sn_mutex; kmutex_t ist_mutex; kcondvar_t ist_cv; iscsit_session_state_t ist_state; @@ -363,6 +384,7 @@ typedef struct { uint32_t ist_expcmdsn; uint32_t ist_maxcmdsn; avl_tree_t ist_task_list; + iscsit_cbuf_t *ist_rxpdu_queue; } iscsit_sess_t; /* Update iscsit_ils_name table whenever login states are modified */ @@ -822,4 +844,10 @@ iscsit_verify_chap_resp(iscsit_conn_login_t *lsm, unsigned int chap_i, uchar_t *chap_c, unsigned int challenge_len, uchar_t *chap_r, unsigned int resp_len); +void +iscsit_rxpdu_queue_monitor_start(void); + +void +iscsit_rxpdu_queue_monitor_stop(void); + #endif /* _ISCSIT_H_ */ diff --git a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c index cd98741de9..39eda2afb7 100644 --- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c +++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <sys/cpuvar.h> @@ -139,7 +138,7 @@ iscsit_sess_create(iscsit_tgt_t *tgt, iscsit_conn_t *ict, } idm_sm_audit_init(&result->ist_state_audit); - rw_init(&result->ist_sn_rwlock, NULL, RW_DRIVER, NULL); + mutex_init(&result->ist_sn_mutex, NULL, MUTEX_DEFAULT, NULL); mutex_init(&result->ist_mutex, NULL, MUTEX_DEFAULT, NULL); cv_init(&result->ist_cv, NULL, CV_DEFAULT, NULL); list_create(&result->ist_events, sizeof (sess_event_ctx_t), @@ -148,7 +147,7 @@ iscsit_sess_create(iscsit_tgt_t *tgt, iscsit_conn_t *ict, offsetof(iscsit_conn_t, ict_sess_ln)); avl_create(&result->ist_task_list, iscsit_task_itt_compare, sizeof (iscsit_task_t), offsetof(iscsit_task_t, it_sess_ln)); - + result->ist_rxpdu_queue = kmem_zalloc(sizeof (iscsit_cbuf_t), KM_SLEEP); result->ist_state = SS_Q1_FREE; result->ist_last_state = SS_Q1_FREE; bcopy(isid, result->ist_isid, ISCSI_ISID_LEN); @@ -252,11 +251,12 @@ iscsit_sess_destroy(iscsit_sess_t *ist) kmem_free(ist->ist_target_alias, strlen(ist->ist_target_alias) + 1); avl_destroy(&ist->ist_task_list); + kmem_free(ist->ist_rxpdu_queue, sizeof (iscsit_cbuf_t)); list_destroy(&ist->ist_conn_list); list_destroy(&ist->ist_events); cv_destroy(&ist->ist_cv); mutex_destroy(&ist->ist_mutex); - rw_destroy(&ist->ist_sn_rwlock); + mutex_destroy(&ist->ist_sn_mutex); kmem_free(ist, sizeof (*ist)); } diff --git a/usr/src/uts/common/io/idm/idm.c b/usr/src/uts/common/io/idm/idm.c index b2c2c05a2c..23bdb1c44a 100644 --- a/usr/src/uts/common/io/idm/idm.c +++ b/usr/src/uts/common/io/idm/idm.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <sys/cpuvar.h> @@ -54,7 +53,6 @@ static struct modlinkage modlinkage = { MODREV_1, (void *)&modlmisc, NULL }; -extern int idm_task_compare(const void *t1, const void *t2); extern void idm_wd_thread(void *arg); static int _idm_init(void); diff --git a/usr/src/uts/common/io/idm/idm_conn_sm.c b/usr/src/uts/common/io/idm/idm_conn_sm.c index 88dfa78922..189a6ae3cd 100644 --- a/usr/src/uts/common/io/idm/idm_conn_sm.c +++ b/usr/src/uts/common/io/idm/idm_conn_sm.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/cpuvar.h> @@ -633,6 +632,8 @@ static void idm_state_s5_logged_in(idm_conn_t *ic, idm_conn_event_ctx_t *event_ctx) { switch (event_ctx->iec_event) { + case CE_MISC_RX: + /* MC/S: when removing the non-leading connection */ case CE_LOGOUT_THIS_CONN_RCV: case CE_LOGOUT_THIS_CONN_SND: case CE_LOGOUT_OTHER_CONN_RCV: @@ -675,7 +676,6 @@ idm_state_s5_logged_in(idm_conn_t *ic, idm_conn_event_ctx_t *event_ctx) idm_update_state(ic, CS_S8_CLEANUP, event_ctx); break; case CE_MISC_TX: - case CE_MISC_RX: case CE_TX_PROTOCOL_ERROR: case CE_RX_PROTOCOL_ERROR: case CE_LOGIN_TIMEOUT: diff --git a/usr/src/uts/common/sys/idm/idm_impl.h b/usr/src/uts/common/sys/idm/idm_impl.h index c015216018..11e9534687 100644 --- a/usr/src/uts/common/sys/idm/idm_impl.h +++ b/usr/src/uts/common/sys/idm/idm_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _IDM_IMPL_H_ #define _IDM_IMPL_H_ @@ -373,6 +372,7 @@ typedef struct idm_pdu_s { uint32_t isp_flags; uint_t isp_hdrbuflen; uint_t isp_databuflen; + time_t isp_queue_time; } idm_pdu_t; /* @@ -522,8 +522,6 @@ uint32_t idm_crc32c_continued(void *address, unsigned long length, void idm_listbuf_insert(list_t *lst, idm_buf_t *buf); -int idm_task_compare(const void *v1, const void *v2); - idm_conn_t *idm_lookup_conn(uint8_t *isid, uint16_t tsih, uint16_t cid); #ifdef __cplusplus |