At the physical layer NVMe rides on the high-speed PCIe interface: a single PCIe 3.0 lane already runs at 8 GT/s (roughly 1 GB/s of usable bandwidth after 128b/130b encoding), and x2, x4, x8 ... N lanes together simply give N times that, which is very powerful. Add to this the lean, efficient NVMe protocol itself, which keeps the protocol-layer overhead low, and the end result is very fast transfers.
Out of curiosity and interest, I wanted to learn NVMe. For the specifications, the natural choice is the latest PCIe 3.0 and NVMe 1.2. But staying at the level of documents and specs is armchair theory; real understanding only comes from some hands-on practice. There are basically two ways to practice: study the NVMe module of the firmware on the device side, or study the NVMe driver on the host side. Since I no longer work on SSDs, the first path is essentially closed to me, so the second one it is. A real NVMe SSD would be great for experiments, but I don't have one, and I suspect most people don't either. Fortunately there is always a workaround, and in this case it is QEMU!
QEMU is an emulator that can emulate x86, ARM, PowerPC and so on, and it can emulate an NVMe device (although at the moment NVMe appears to be supported only on x86; I tried an ARM+PCI setup and QEMU reported that the nvme device was not supported). For the host OS, Linux is the obvious choice, since it gives us easy access to all of the NVMe-related code.
So... how do we actually get started? Linux is just a kernel; we still need a rootfs and assorted other pieces. What we want to study here is the NVMe driver, not how to build a Linux system from scratch, so we need a quick and convenient route. This is where buildroot comes in: with it, everything is taken care of. ^_^ The only prerequisites are a Linux host (I recommend Ubuntu: I like it, many people use it so answers are easy to find online, and the latest release is 16.04) with QEMU installed.
The latest buildroot release can be downloaded from https://buildroot.org/downloads/buildroot-2016.05.tar.bz2
After unpacking it, run:
make qemu_x86_64_defconfig
make
When the build finishes, the command described in board/qemu/x86_64/readme.txt boots the freshly built Linux system inside QEMU:
qemu-system-x86_64 -M pc -kernel output/images/bzImage -drive file=output/images/rootfs.ext2,if=virtio,format=raw -append root=/dev/vda -net nic,model=virtio -net user
By default the system log goes to the virtual machine's "screen" rather than to the shell, and it cannot be scrolled back, which makes debugging awkward. We need to change a few things to redirect the log onto the Linux shell. First, edit the .config file in the buildroot directory and change
BR2_TARGET_GENERIC_GETTY_PORT="tty1"
to
BR2_TARGET_GENERIC_GETTY_PORT="ttyS0"
Then rebuild. Once the build completes, run the modified command below and we get exactly what we want.
make
qemu-system-x86_64 -M pc -kernel output/images/bzImage -drive file=output/images/rootfs.ext2,if=virtio,format=raw -append "console=ttyS0 root=/dev/vda" -net nic,model=virtio -net user -serial stdio
Next, let's extend the command line to add NVMe support.
qemu-img create -f raw nvme.img 1G
qemu-system-x86_64 -M pc -kernel output/images/bzImage -drive file=output/images/rootfs.ext2,if=virtio,format=raw -append "console=ttyS0 root=/dev/vda" -net nic,model=virtio -net user -serial stdio -drive file=nvme.img,if=none,format=raw,id=drv0 -device nvme,drive=drv0,serial=foo
Once the Linux system is up, the NVMe-related devices show up under /dev.
# ls -l /dev
crw------- 1 root root 253, 0 Jun 3 13:00 nvme0
brw------- 1 root root 259, 0 Jun 3 13:00 nvme0n1
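Before moving on to the code, a quick sanity check can confirm that the emulated controller actually serves I/O. The short program below is only a sketch for this setup (cross-compile it with the toolchain buildroot generated under output/host/ and drop it into the rootfs, or adapt it to any Linux machine with an NVMe device); it reads the first 4 KiB of the namespace block device:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned char buf[4096];
	int fd = open("/dev/nvme0n1", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/nvme0n1");
		return 1;
	}

	/* Read the first 4 KiB; the backing image was created empty, so expect zeroes. */
	if (read(fd, buf, sizeof(buf)) != sizeof(buf)) {
		perror("read");
		close(fd);
		return 1;
	}

	printf("first bytes: %02x %02x %02x %02x\n",
	       buf[0], buf[1], buf[2], buf[3]);
	close(fd);
	return 0;
}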
At this point we can pause the hands-on part and go read the NVMe code. Whenever a question comes up, we can modify the code and run it in QEMU to see the effect. Great!
The driver analysis that follows is based on Linux kernel 4.5.3. Why this version? Mainly because it is the kernel that buildroot-2016.05 selects by default. The kernel version can also be changed by hand, but I won't go into that here. The NVMe code lives in the drivers/nvme directory (drivers/nvme/host, to be precise); there are not many files, mainly two: core.c and pci.c.
To analyze a driver, the first thing is to find its entry point. module_init declares nvme_init as the entry point of this driver; it is called automatically while Linux boots (or when the module is loaded).
static int __init nvme_init(void)
{
int result;
init_waitqueue_head(&nvme_kthread_wait);
nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
if (!nvme_workq)
return -ENOMEM;
result = nvme_core_init();
if (result < 0)
goto kill_workq;
result = pci_register_driver(&nvme_driver);
if (result)
goto core_exit;
return 0;
core_exit:
nvme_core_exit();
kill_workq:
destroy_workqueue(nvme_workq);
return result;
}
static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
nvme_core_exit();
destroy_workqueue(nvme_workq);
BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
_nvme_check_size();
}
module_init(nvme_init);
module_exit(nvme_exit);
Flow of nvme_init:
1. Create a global workqueue; from now on, all sorts of work items can be queued onto it for execution.
2. Call nvme_core_init.
3. Call pci_register_driver.
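For readers who have not used the workqueue API before, here is a minimal, self-contained sketch of the same pattern used throughout this driver (all demo_* names are invented for illustration): allocate a workqueue, bind a work item to a callback with INIT_WORK, then queue it, exactly as nvme_probe will later do with dev->reset_work.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_workq;
static struct work_struct demo_work;

/* Runs later, in process context, on a worker thread of demo_workq. */
static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo work executed\n");
}

static int __init demo_init(void)
{
	demo_workq = alloc_workqueue("demo", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!demo_workq)
		return -ENOMEM;

	INIT_WORK(&demo_work, demo_work_fn);	/* bind the callback */
	queue_work(demo_workq, &demo_work);	/* schedule it */
	return 0;
}

static void __exit demo_exit(void)
{
	flush_workqueue(demo_workq);
	destroy_workqueue(demo_workq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With that pattern in mind, the next call in the flow is nvme_core_init.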
int __init nvme_core_init(void)
{
int result;
result = register_blkdev(nvme_major, "nvme");
if (result < 0)
return result;
else if (result > 0)
nvme_major = result;
result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
&nvme_dev_fops);
if (result < 0)
goto unregister_blkdev;
else if (result > 0)
nvme_char_major = result;
nvme_class = class_create(THIS_MODULE, "nvme");
if (IS_ERR(nvme_class)) {
result = PTR_ERR(nvme_class);
goto unregister_chrdev;
}
return 0;
unregister_chrdev:
__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
unregister_blkdev:
unregister_blkdev(nvme_major, "nvme");
return result;
}
Flow of nvme_core_init:
1. Call register_blkdev to register a block device named nvme (nvme_major starts out as 0, so the kernel allocates a major number dynamically and returns it).
2. Call __register_chrdev to register a character device named nvme.

Back in nvme_init, pci_register_driver registers a PCI driver. A couple of things matter here. One is the vendor ID / device ID table: it contains the entry PCI_VDEVICE(INTEL, 0x5845), which is what lets this driver be matched against the device enumerated on the PCI bus so that the right driver gets loaded.
static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0x0953),
.driver_data = NVME_QUIRK_STRIPE_SIZE, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);
static struct pci_driver nvme_driver = {
.name = "nvme",
.id_table = nvme_id_table,
.probe = nvme_probe,
.remove = nvme_remove,
.shutdown = nvme_shutdown,
.driver = {
.pm = &nvme_dev_pm_ops,
},
.err_handler = &nvme_err_handler,
};
In Linux we can list the PCI devices with lspci, and indeed the NVMe device's device ID is 0x5845.
# lspci -k
00:00.0 Class 0600: 8086:1237
00:01.0 Class 0601: 8086:7000
00:01.1 Class 0101: 8086:7010 ata_piix
00:01.3 Class 0680: 8086:7113
00:02.0 Class 0300: 1234:1111 bochs-drm
00:03.0 Class 0200: 1af4:1000 virtio-pci
00:04.0 Class 0108: 8086:5845 nvme
00:05.0 Class 0100: 1af4:1001 virtio-pci
The other important thing pci_register_driver does is install the probe function. Once a device matches the driver, that driver's probe function is called to actually bring the device up. So after nvme_init returns, the driver does nothing further until the PCI bus enumerates the NVMe device, at which point our nvme_probe is invoked.
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
struct nvme_dev *dev;
node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, 0);
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
GFP_KERNEL, node);
if (!dev->entry)
goto free;
dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
GFP_KERNEL, node);
if (!dev->queues)
goto free;
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
result = nvme_dev_map(dev);
if (result)
goto free;
INIT_LIST_HEAD(&dev->node);
INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
init_completion(&dev->ioq_wait);
result = nvme_setup_prp_pools(dev);
if (result)
goto put_pci;
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
id->driver_data);
if (result)
goto release_pools;
queue_work(nvme_workq, &dev->reset_work);
return 0;
release_pools:
nvme_release_prp_pools(dev);
put_pci:
put_device(dev->dev);
nvme_dev_unmap(dev);
free:
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
return result;
}
Flow of nvme_probe:
1. Allocate memory for dev, dev->entry and dev->queues.
2. Call nvme_dev_map.
3. Initialize the three work items and associate them with their callback functions.
4. Call nvme_setup_prp_pools.
5. Call nvme_init_ctrl.
6. Queue dev->reset_work on the workqueue, i.e. schedule nvme_reset_work.
static int nvme_dev_map(struct nvme_dev *dev)
{
int bars;
struct pci_dev *pdev = to_pci_dev(dev->dev);
bars = pci_select_bars(pdev, IORESOURCE_MEM);
if (!bars)
return -ENODEV;
if (pci_request_selected_regions(pdev, bars, "nvme"))
return -ENODEV;
dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
if (!dev->bar)
goto release;
return 0;
release:
pci_release_regions(pdev);
return -ENODEV;
}
Flow of nvme_dev_map:
1. Call pci_select_bars. Its return value is a mask in which each bit stands for one BAR (Base Address Register); a bit is set when the corresponding BAR is non-zero. This comes from the PCI specification: a PCI device's configuration space contains six 32-bit BAR registers, each describing a region of memory or I/O space on the device.
2. Call pci_request_selected_regions. One of its arguments is the mask returned by pci_select_bars, and its job is to reserve those BAR regions so that nobody else can use them. If this call is skipped (comment it out and rebuild the kernel; in buildroot a simple make linux-rebuild is enough), /proc/iomem looks like this:
# cat /proc/iomem
08000000-febfffff : PCI Bus 0000:00
fd000000-fdffffff : 0000:00:02.0
fd000000-fdffffff : bochs-drm
feb80000-febbffff : 0000:00:03.0
febc0000-febcffff : 0000:00:02.0
febd0000-febd1fff : 0000:00:04.0
febd2000-febd2fff : 0000:00:02.0
febd2000-febd2fff : bochs-drm
febd3000-febd3fff : 0000:00:03.0
febd4000-febd4fff : 0000:00:04.0
febd5000-febd5fff : 0000:00:05.0
With pci_request_selected_regions in place, /proc/iomem looks like the following instead: two nvme entries appear, BAR0 at physical address 0xfebd0000 and BAR4 at 0xfebd4000.
# cat /proc/iomem
08000000-febfffff : PCI Bus 0000:00
fd000000-fdffffff : 0000:00:02.0
fd000000-fdffffff : bochs-drm
feb80000-febbffff : 0000:00:03.0
febc0000-febcffff : 0000:00:02.0
febd0000-febd1fff : 0000:00:04.0
febd0000-febd1fff : nvme
febd2000-febd2fff : 0000:00:02.0
febd2000-febd2fff : bochs-drm
febd3000-febd3fff : 0000:00:03.0
febd4000-febd4fff : 0000:00:04.0
febd4000-febd4fff : nvme
febd5000-febd5fff : 0000:00:05.0
3. Call ioremap. As noted above, BAR0 corresponds to physical address 0xfebd0000. In Linux we cannot dereference a physical address directly; it has to be mapped to a virtual address first, and that is exactly what ioremap does. Once the mapping exists, we can operate on the NVMe controller's registers simply through dev->bar. Note that the code does not choose which BAR to map based on the return value of pci_select_bars; it hard-codes BAR0, because the NVMe specification mandates that BAR0 is the base of the memory-mapped controller registers. BAR4 is implementation specific, and it is not yet clear what it is used for here.
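As a small illustration of what the mapping buys us, here is a hedged sketch that reads two registers through dev->bar: the 64-bit capabilities register CAP at offset 0x00 and the version register VS at offset 0x08, both defined in the NVMe register map. It is a fragment meant to live inside pci.c, where struct nvme_dev and these helpers are visible.

/* Sketch only: dev->bar was set up by ioremap() in nvme_dev_map(). */
static void demo_dump_regs(struct nvme_dev *dev)
{
	u64 cap = lo_hi_readq(dev->bar + 0x00);	/* NVME_REG_CAP */
	u32 vs  = readl(dev->bar + 0x08);	/* NVME_REG_VS  */

	/* VS: major version in bits 31:16, minor version in bits 15:8. */
	dev_info(dev->dev, "NVMe %u.%u, CAP=%#llx\n",
		 (vs >> 16) & 0xffff, (vs >> 8) & 0xff,
		 (unsigned long long)cap);
}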
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
PAGE_SIZE, PAGE_SIZE, 0);
if (!dev->prp_page_pool)
return -ENOMEM;
/* Optimisation for I/Os between 4k and 128k */
dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
256, 256, 0);
if (!dev->prp_small_pool) {
dma_pool_destroy(dev->prp_page_pool);
return -ENOMEM;
}
return 0;
}
Back in nvme_probe, nvme_setup_prp_pools mainly creates two DMA pools: one of page-sized buffers for PRP list pages, and a 256-byte one as an optimisation for I/Os between 4 KB and 128 KB.
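A DMA pool is simply an allocator for many small, equal-sized, DMA-coherent buffers; the driver will later carve PRP lists for each request out of these pools. A minimal usage sketch of the API (the demo_* names are invented):

#include <linux/dmapool.h>

/* Sketch: create a pool of 256-byte, 256-byte-aligned DMA buffers. */
static int demo_use_dma_pool(struct device *dev)
{
	struct dma_pool *pool;
	dma_addr_t dma_addr;
	void *vaddr;

	pool = dma_pool_create("demo pool", dev, 256, 256, 0);
	if (!pool)
		return -ENOMEM;

	/* vaddr is for the CPU, dma_addr is what the device is given. */
	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &dma_addr);
	if (!vaddr) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... hand dma_addr to the hardware, e.g. as a PRP entry ... */

	dma_pool_free(pool, vaddr, dma_addr);
	dma_pool_destroy(pool);
	return 0;
}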
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
int ret;
INIT_LIST_HEAD(&ctrl->namespaces);
mutex_init(&ctrl->namespaces_mutex);
kref_init(&ctrl->kref);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
ret = nvme_set_instance(ctrl);
if (ret)
goto out;
ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
MKDEV(nvme_char_major, ctrl->instance),
dev, nvme_dev_attr_groups,
"nvme%d", ctrl->instance);
if (IS_ERR(ctrl->device)) {
ret = PTR_ERR(ctrl->device);
goto out_release_instance;
}
get_device(ctrl->device);
dev_set_drvdata(ctrl->device, ctrl);
ida_init(&ctrl->ns_ida);
spin_lock(&dev_list_lock);
list_add_tail(&ctrl->node, &nvme_ctrl_list);
spin_unlock(&dev_list_lock);
return 0;
out_release_instance:
nvme_release_instance(ctrl);
out:
return ret;
}
Back in nvme_probe: the main job of nvme_init_ctrl is to create, via device_create_with_groups, a character device named nvme0, the one we saw earlier.
crw------- 1 root root 253, 0 Jun 3 13:00 nvme0
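This /dev/nvme0 character device is the hook through which user space can talk to the controller directly; it is what tools such as nvme-cli use. As a hedged illustration, the sketch below issues an Identify Controller admin command through the NVME_IOCTL_ADMIN_CMD passthrough ioctl. The ioctl and struct come from linux/nvme_ioctl.h; opcode 0x06 with CDW10 = 1 means Identify Controller in the NVMe spec, and the serial number and model string sit at bytes 4-23 and 24-63 of the returned 4 KiB of data. Everything else (variable names, error handling) is just illustration.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	unsigned char data[4096];
	struct nvme_admin_cmd cmd;
	int fd = open("/dev/nvme0", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/nvme0");
		return 1;
	}

	memset(&cmd, 0, sizeof(cmd));
	memset(data, 0, sizeof(data));
	cmd.opcode   = 0x06;			/* Identify */
	cmd.nsid     = 0;
	cmd.addr     = (uintptr_t)data;		/* data buffer */
	cmd.data_len = sizeof(data);
	cmd.cdw10    = 1;			/* CNS=1: Identify Controller */

	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0) {
		perror("NVME_IOCTL_ADMIN_CMD");
		close(fd);
		return 1;
	}

	printf("serial: %.20s\nmodel : %.40s\n", &data[4], &data[24]);
	close(fd);
	return 0;
}

Run inside our QEMU guest, the serial should come back as "foo", since that is what we passed with -device nvme,drive=drv0,serial=foo.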
Back in nvme_probe once more: dev->reset_work gets scheduled, which means nvme_reset_work is now called.
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
int result = -ENODEV;
if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
goto out;
/*
* If we're called to reset a live controller first shut it down before
* moving on.
*/
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
set_bit(NVME_CTRL_RESETTING, &dev->flags);
result = nvme_pci_enable(dev);
if (result)
goto out;
result = nvme_configure_admin_queue(dev);
if (result)
goto out;
nvme_init_queue(dev->queues[0], 0);
result = nvme_alloc_admin_tags(dev);
if (result)
goto out;
result = nvme_init_identify(&dev->ctrl);
if (result)
goto out;
result = nvme_setup_io_queues(dev);
if (result)
goto out;
dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
result = nvme_dev_list_add(dev);
if (result)
goto out;
/*
* Keep the controller around but remove all namespaces if we don't have
* any working I/O queue.
*/
if (dev->online_queues < 2) {
dev_warn(dev->dev, "IO queues not created\n");
nvme_remove_namespaces(&dev->ctrl);
} else {
nvme_start_queues(&dev->ctrl);
nvme_dev_add(dev);
}
clear_bit(NVME_CTRL_RESETTING, &dev->flags);
return;
out:
nvme_remove_dead_ctrl(dev, result);
}
Flow of nvme_reset_work:
1. Use the NVME_CTRL_RESETTING flag to make sure nvme_reset_work is not re-entered.
2. Call nvme_pci_enable.
3. Call nvme_configure_admin_queue.
4. Call nvme_init_queue.
5. Call nvme_alloc_admin_tags.
6. Call nvme_init_identify.
7. Call nvme_setup_io_queues.
8. Call nvme_dev_list_add.
static int nvme_pci_enable(struct nvme_dev *dev)
{
u64 cap;
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pci_enable_device_mem(pdev))
return result;
dev->entry[0].vector = pdev->irq;
pci_set_master(pdev);
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
goto disable;
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
result = -ENODEV;
goto disable;
}
/*
* Some devices don't advertse INTx interrupts, pre-enable a single
* MSIX vec for setup. We'll adjust this later.
*/
if (!pdev->irq) {
result = pci_enable_msix(pdev, dev->entry, 1);
if (result < 0)
goto disable;
}
cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
dev->dbs = dev->bar + 4096;
/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
dev->q_depth = 2;
dev_warn(dev->dev, "detected Apple NVMe controller, set "
"queue depth=%u to work around controller resets\n",
dev->q_depth);
}
if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
dev->cmb = nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
return 0;
disable:
pci_disable_device(pdev);
return result;
}
Flow of nvme_pci_enable:
1. Call pci_enable_device_mem to enable the device's memory space, i.e. the BAR0 region mapped earlier. After this, readl(dev->bar + NVME_REG_CSTS) and similar accesses operate directly on the controller registers laid out in the register map of the NVMe specification.
2. A PCI device can deliver interrupts either over the legacy INTx lines or via MSI/MSI-X. If the device does not advertise an INTx interrupt (pdev->irq is 0), a single MSI-X vector is pre-enabled for the setup phase. In our QEMU setup the legacy INTx mode is used, with IRQ number 11.
# cat /proc/interrupts
CPU0
0: 86 IO-APIC 2-edge timer
1: 9 IO-APIC 1-edge i8042
4: 250 IO-APIC 4-edge serial
9: 0 IO-APIC 9-fasteoi acpi
10: 100 IO-APIC 10-fasteoi virtio1
11: 13 IO-APIC 11-fasteoi virtio0, nvme0q0, nvme0q1
12: 125 IO-APIC 12-edge i8042
14: 0 IO-APIC 14-edge ata_piix
15: 5 IO-APIC 15-edge ata_piix
3. Read a few configuration parameters out of the CAP register and set dev->dbs to dev->bar + 4096; the 4096 comes from the register map, where the doorbell registers start at offset 0x1000.
4. If the controller reports an NVMe version of 1.2 or higher, call nvme_map_cmb to map the Controller Memory Buffer. The NVMe device in QEMU 2.5 implements version 1.1, so this is not supported here.
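To make the doorbell layout concrete: the doorbells start at offset 0x1000, each queue pair owns one submission tail doorbell and one completion head doorbell, and consecutive doorbells are spaced by the stride encoded in CAP.DSTRD. Below is a small sketch of the arithmetic the driver relies on later (q_db = &dev->dbs[qid * 2 * dev->db_stride]); db_stride here is 1 << CAP.DSTRD, counted in 32-bit registers, exactly what nvme_pci_enable stores in dev->db_stride.

/* Sketch: byte offsets of the doorbells of queue 'qid' inside BAR0. */
static unsigned int sq_doorbell_offset(unsigned int qid, unsigned int db_stride)
{
	return 4096 + (qid * 2) * db_stride * 4;	/* SQ tail doorbell */
}

static unsigned int cq_doorbell_offset(unsigned int qid, unsigned int db_stride)
{
	return 4096 + (qid * 2 + 1) * db_stride * 4;	/* CQ head doorbell */
}

For the admin queue (qid 0) with DSTRD = 0 this gives 0x1000 and 0x1004, matching the register map.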
Back in nvme_reset_work, the next function to look at is nvme_configure_admin_queue.
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
int result;
u32 aqa;
u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
struct nvme_queue *nvmeq;
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
NVME_CAP_NSSRC(cap) : 0;
if (dev->subsystem &&
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
result = nvme_disable_ctrl(&dev->ctrl, cap);
if (result < 0)
return result;
nvmeq = dev->queues[0];
if (!nvmeq) {
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
if (!nvmeq)
return -ENOMEM;
}
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;
writel(aqa, dev->bar + NVME_REG_AQA);
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
result = nvme_enable_ctrl(&dev->ctrl, cap);
if (result)
goto free_nvmeq;
nvmeq->cq_vector = 0;
result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
if (result) {
nvmeq->cq_vector = -1;
goto free_nvmeq;
}
return result;
free_nvmeq:
nvme_free_queues(dev, 0);
return result;
}
Flow of nvme_configure_admin_queue:
1. Learn from the CAP register whether NVM Subsystem Reset is supported.
2. Call nvme_disable_ctrl.
3. Call nvme_alloc_queue.
4. Program the AQA register with the admin queue size and the ASQ/ACQ registers with the DMA addresses of the admin submission and completion queues.
5. Call nvme_enable_ctrl.
6. Call queue_request_irq.
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
int ret;
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
return nvme_wait_ready(ctrl, cap, false);
}
Here ctrl->ops is the nvme_pci_ctrl_ops that was passed in by nvme_init_ctrl earlier; reg_write32 disables the controller by writing the NVME_REG_CC register.
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
*val = readl(to_nvme_dev(ctrl)->bar + off);
return 0;
}
static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
writel(val, to_nvme_dev(ctrl)->bar + off);
return 0;
}
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
.io_incapable = nvme_pci_io_incapable,
.reset_ctrl = nvme_pci_reset_ctrl,
.free_ctrl = nvme_pci_free_ctrl,
};
The driver then polls the status register NVME_REG_CSTS until the controller has really stopped. The timeout limit is computed from the Timeout (TO) field of the CAP register, where each unit stands for 500 ms (for example, TO = 3 gives a 2-second limit).
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
unsigned long timeout =
((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
int ret;
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if ((csts & NVME_CSTS_RDY) == bit)
break;
msleep(100);
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
dev_err(ctrl->dev,
"Device not ready; aborting %s\n", enabled ?
"initialisation" : "reset");
return -ENODEV;
}
}
return ret;
}
Back to nvme_configure_admin_queue and on to nvme_alloc_queue.
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth)
{
struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
if (!nvmeq)
return NULL;
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
goto free_cqdma;
nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
dev->ctrl.instance, qid);
spin_lock_init(&nvmeq->q_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->qid = qid;
nvmeq->cq_vector = -1;
dev->queues[qid] = nvmeq;
/* make sure queue descriptor is set before queue count, for kthread */
mb();
dev->queue_count++;
return nvmeq;
free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
kfree(nvmeq);
return NULL;
}
Flow of nvme_alloc_queue:
1. Call dma_zalloc_coherent to allocate the completion queue memory for DMA. nvmeq->cqes is the virtual address of that memory, used by the kernel, while nvmeq->cq_dma_addr is its DMA address, the one handed to the device.
2. Call nvme_alloc_sq_cmds to take care of the submission queue: if the NVMe version is 1.2 or above and the CMB supports submission queues, place the SQ in the CMB; otherwise allocate it with dma_alloc_coherent, just like the completion queue.
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth)
{
if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
dev->ctrl.page_size);
nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
nvmeq->sq_cmds_io = dev->cmb + offset;
} else {
nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
}
return 0;
}
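For reference, the amounts of memory being allocated here come from two small macros in pci.c, which (as of 4.5) boil down to one 64-byte struct nvme_command per submission queue entry and one 16-byte struct nvme_completion per completion queue entry:

#define SQ_SIZE(depth)	(depth * sizeof(struct nvme_command))	 /* 64 bytes per entry */
#define CQ_SIZE(depth)	(depth * sizeof(struct nvme_completion)) /* 16 bytes per entry */

So a queue of depth 32, for example, needs 2 KiB for the SQ and 512 bytes for the CQ.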
Back in nvme_configure_admin_queue once more, let's look at nvme_enable_ctrl. There is nothing special about it; it can be viewed as the reverse of the nvme_disable_ctrl we analysed above.
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accomodate architectures with differing
* kernel and IO page sizes.
*/
unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
int ret;
if (page_shift < dev_page_min) {
dev_err(ctrl->dev,
"Minimum device page size %u too large for host (%u)\n",
1 << dev_page_min, 1 << page_shift);
return -ENODEV;
}
ctrl->page_size = 1 << page_shift;
ctrl->ctrl_config = NVME_CC_CSS_NVM;
ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
ctrl->ctrl_config |= NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
return nvme_wait_ready(ctrl, cap, true);
}
Back to nvme_configure_admin_queue for its last call, queue_request_irq. Its main job is to install the interrupt handler; by default it does not use a threaded interrupt handler but handles the interrupt entirely in interrupt context.
static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
const char *name)
{
if (use_threaded_interrupts)
return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
nvme_irq_check, nvme_irq, IRQF_SHARED,
name, nvmeq);
return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
IRQF_SHARED, name, nvmeq);
}
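For completeness, here is a minimal sketch of the threaded variant selected by use_threaded_interrupts: the first handler runs in hard-IRQ context and only decides whether the interrupt needs handling, while the heavy lifting is deferred to a kernel thread (the demo_* names are invented).

#include <linux/interrupt.h>

/* Hard-IRQ context: just decide whether this interrupt is ours. */
static irqreturn_t demo_irq_check(int irq, void *data)
{
	/* e.g. peek at the completion queue phase bit here */
	return IRQ_WAKE_THREAD;		/* defer the real work to the thread */
}

/* Kernel-thread (process) context: do the actual completion handling. */
static irqreturn_t demo_irq_thread(int irq, void *data)
{
	return IRQ_HANDLED;
}

static int demo_setup_irq(int irq, void *data)
{
	return request_threaded_irq(irq, demo_irq_check, demo_irq_thread,
				    IRQF_SHARED, "demo", data);
}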
Returning all the way up to nvme_reset_work, the next step is nvme_init_queue.
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
struct nvme_dev *dev = nvmeq->dev;
spin_lock_irq(&nvmeq->q_lock);
nvmeq->sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
dev->online_queues++;
spin_unlock_irq(&nvmeq->q_lock);
}
To be continued…