1、问题现象
在12盘位服务器中,前面2块盘做了RAID1,该RAID1逻辑盘应该识别为sda。安装RHEL6 x86_64系统,启动后发现通过fdisk查看到的盘符不是按顺序排列的,而是乱序的。
[root@localhost ~]# uname -a
Linux localhost.localdomain 2.6.32-71.el6.x86_64 #1 SMP Wed Sep 1 01:33:01 EDT 2010 x86_64 x86_64 x86_64 GNU/Linux
[root@localhost ~]# cat /etc/redhat-release
Red Hat Enterprise Linux Server release 6.0 (Santiago)
[root@localhost ~]#
(1)通过df -h或者fdisk这些需要使用驱动的工具看到的磁盘顺序是乱的,但通过lsscsi这类工具查看到的底层顺序是没问题的。
root@r05d11019.yh.aliyun.com # df -h
Filesystem Size Used Avail Use% Mounted on
/dev/sda2 145G 1.7G 136G 2% /
/dev/sda1 485M 44M 417M 10% /boot
/dev/sda5 771G 197M 731G 1% /disk0
tmpfs 24G 0 24G 0% /dev/shm
/dev/sdh1 917G 200M 908G 1% /disk1
/dev/sdb1 917G 200M 908G 1% /disk2
/dev/sdk1 917G 200M 908G 1% /disk3
/dev/sde1 917G 200M 908G 1% /disk4
/dev/sdg1 917G 200M 908G 1% /disk5
/dev/sdd1 917G 200M 908G 1% /disk6
/dev/sdc1 917G 200M 908G 1% /disk7
/dev/sdi1 917G 200M 908G 1% /disk8
/dev/sdf1 917G 200M 908G 1% /disk9
/dev/sdj1 917G 200M 908G 1% /disk10
应该是
/dev/sdb1 /disk1
/dev/sdc1 /disk2
依次类推
(2)原来fdisk的结果是
Disk /dev/sda: 1000.0 GB, 999999668224 bytes
Disk /dev/sdh: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdb: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdk: 1000.2 GB, 1000204886016 bytes
Disk /dev/sde: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdg: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdd: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdc: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdi: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdf: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdj: 1000.2 GB, 1000204886016 bytes
通过使用sort -k 2排序后
Disk /dev/sda: 1000.0 GB, 999999668224 bytes
Disk /dev/sdb: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdc: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdd: 1000.2 GB, 1000204886016 bytes
Disk /dev/sde: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdf: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdg: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdh: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdi: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdj: 1000.2 GB, 1000204886016 bytes
Disk /dev/sdk: 1000.2 GB, 1000204886016 bytes
(3)lsscsi结果的顺序没有问题,硬盘盘符也与物理槽位顺序一致。
root@r05d11019.yh.aliyun.com # lsscsi
[0:0:0:0] disk ATA ST1000NM0011 SN02 -
[0:0:1:0] disk ATA ST1000NM0011 SN02 -
[0:0:2:0] disk ATA ST1000NM0011 SN02 /dev/sdb
[0:0:3:0] disk ATA ST1000NM0011 SN02 /dev/sdc
[0:0:4:0] disk ATA ST1000NM0011 SN02 /dev/sdd
[0:0:5:0] disk ATA ST1000NM0011 SN02 /dev/sde
[0:0:6:0] disk ATA ST1000NM0011 SN02 /dev/sdf
[0:0:7:0] disk ATA ST1000NM0011 SN02 /dev/sdg
[0:0:8:0] disk ATA ST1000NM0011 SN02 /dev/sdh
[0:0:9:0] disk ATA ST1000NM0011 SN02 /dev/sdi
[0:0:10:0] disk ATA ST1000NM0011 SN02 /dev/sdj
[0:0:11:0] disk ATA ST1000NM0011 SN02 /dev/sdk
[0:0:12:0] enclosu PMC 8399 1 -
[0:1:0:0] disk LSILOGIC Logical Volume 3000 /dev/sda
root@r05e08048.yh.aliyun.com # ./diskinfo
WARNING: Deprecated config file /etc/modprobe.conf, all config files belong into /etc/modprobe.d/.
=========IOCPort:0==========
Target SerialNum PhyNum OSDeviceName
—————————————–
1 ——– 1 ——–
2 Z1N0769E 2 /dev/sdb
3 Z1N05ZHB 3 /dev/sdc
4 Z1N07P9T 4 /dev/sdd
5 Z1N07CMF 5 /dev/sde
6 Z1N05D0X 6 /dev/sdf
7 Z1N07X2E 7 /dev/sdg
8 Z1N07CET 8 /dev/sdh
9 Z1N06JTT 9 /dev/sdi
10 Z1N06NNR 10 /dev/sdj
11 Z1N06NEJ 11 /dev/sdk
15 ——– 24 ——–
16 ——– 0 ——–
Hidden RAID Devices:
B___T Device Vendor Product Rev PhyNum SerialNum
0 1 PhysDisk 0 ATA ST1000NM0011 SN02 1
0 16 PhysDisk 1 ATA ST1000NM0011 SN02 0
2、乱序原因分析
我们首先分析fdisk的结果是从哪里读取的?通过strace fdisk -l跟踪,可以确定fdisk命令首先读取/proc/partitions文件,然后打开块设备读取详细信息。/proc/partitions文件中的块设备顺序,就是我们fdisk看到的盘符顺序。
[root@localhost ~]# fdisk -l |grep Disk
Disk /dev/sdj: 1000.0 GB, 999999668224 bytes
Disk identifier: 0x00000000
Disk /dev/sdh: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x000068e8
Disk /dev/sdf: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x00000000
Disk /dev/sde: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x329d8478
Disk /dev/sdi: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0xb929dae9
Disk /dev/sdd: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x00000000
Disk /dev/sdc: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x000df6da
Disk /dev/sda: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x000934d8
Disk /dev/sdg: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x000cbd31
Disk /dev/md0: 644.3 GB, 644267114496 bytes
Disk identifier: 0x00000000
Disk /dev/hioa: 644.4 GB, 644396089344 bytes
Disk identifier: 0x1f933faa
Disk /dev/sdb: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x00062721
Disk /dev/sdk: 1000.2 GB, 1000204886016 bytes
Disk identifier: 0x000a0df5
[root@localhost ~]#
[root@localhost ~]# cat /proc/partitions
major minor #blocks name
8 144 976562176 sdj
8 112 976762584 sdh
8 113 314584798 sdh1
8 80 976762584 sdf
8 81 314584798 sdf1
8 64 976762584 sde
8 65 314584798 sde1
8 128 976762584 sdi
8 129 83891398 sdi1
8 130 83891430 sdi2
8 48 976762584 sdd
8 49 488287611 sdd1
8 50 488271577 sdd2
8 32 976762584 sdc
8 0 976762584 sda
8 1 51200000 sda1
8 2 2097152 sda2
8 96 976762584 sdg
8 97 314584798 sdg1
9 0 629167104 md0
252 0 629293056 hioa
8 16 976762584 sdb
8 17 976760001 sdb1
8 160 976762584 sdk
8 161 314584798 sdk1
[root@localhost ~]#
/proc/partitions文件中每条记录,都由show_partition()函数实现。该函数在文件block/genhd.c中。
00770: static int show_partition(struct seq_file *seqf, void *v)
00771: {
00772:     struct gendisk *sgp = v;
00773:     struct disk_part_iter piter;
00774:     struct hd_struct *part;
00775:     char buf[BDEVNAME_SIZE];
00776:
00777:     /* Don't show non-partitionable removeable devices or empty devices */
00778:     if (!get_capacity(sgp) || (!disk_partitionable(sgp) &&
00779:         (sgp->flags & GENHD_FL_REMOVABLE)))
00780:         return 0;
00781:     if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
00782:         return 0;
00783:
00784:     /* show the full disk and all non-0 size partitions of it */
00785:     disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
00786:     while ((part = disk_part_iter_next(&piter)))
00787:         seq_printf(seqf, "%4d  %7d %10llu %s\n",
00788:             MAJOR(part_devt(part)), MINOR(part_devt(part)),
00789:             (unsigned long long)part->nr_sects >> 1,
00790:             disk_name(sgp, part->partno, buf));
00791:     disk_part_iter_exit(&piter);
00792:
00793:     return 0;
00794: } /* end show_partition */
这里我们不关心/proc/partitions文件读写时间及内核seq_file原理。简单地说,/proc/partitions读取的内容盘符顺序,就是内核控制器驱动加载时,加入硬盘盘符的顺序。
sd_probe()函数(在文件drivers/scsi/sd.c)负责将控制器扫描到的SCSI硬盘设备分配盘符,并加入系统中。在块设备驱动中,可以知道只有将硬盘设备调用add_disk()后,系统才可以真正看到和使用该硬盘设备。
在2.6.32内核中,sd_probe()函数发生了变化,即将硬盘设备加入系统中,是异步的方式,而不是采用同步的方式(见2378行)。这样就出现/dev/sda,/dev/sdb,… … 不是顺序加入内核系统中,而是异步的方式,/proc/partitions看到的结果就是乱序的。
02378: async_schedule(sd_probe_async,sdkp);
下面是硬盘设备真正加入内核系统时,内核调用栈。
tie the class to the device
device: ‘sda’: device_add
Pid: 508, comm: async/0 Not tainted 2.6.32-71.el6.debug #7
Call Trace:
[<ffffffff8132bf53>] device_add+0x5b3/0x690
[<ffffffff811dad21>] register_disk+0x41/0x170
[<ffffffff8124901c>] add_disk+0x8c/0x160
[<ffffffffa006434b>] sd_probe_async+0x13b/0x210 [sd_mod]
[<ffffffff81099042>] async_thread+0x102/0x250
[<ffffffff8105c490>] ? default_wake_function+0x0/0x20
[<ffffffff81098f40>] ? async_thread+0x0/0x250
[<ffffffff81091936>] kthread+0x96/0xa0
[<ffffffff810141ca>] child_rip+0xa/0x20
[<ffffffff810918a0>] ? kthread+0x0/0xa0
02312: static int sd_probe(struct device *dev)
02313: {
02314:     struct scsi_device *sdp = to_scsi_device(dev);
02315:     struct scsi_disk *sdkp;
02316:     struct gendisk *gd;
02317:     u32 index;
02318:     int error;
02319:
02320:     error = -ENODEV;
02321:     if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type !=
02321:         TYPE_RBC)
02322:         goto out;
02323:
02324:     SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
02325:         "sd_attach\n"));
02326:
02327:     error = -ENOMEM;
02328:     sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL);
02329:     if (!sdkp)
02330:         goto out;
02331:
02332:     gd = alloc_disk(SD_MINORS);
02333:     if (!gd)
02334:         goto out_free;
02335:
02336:     do {
02337:         if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
02338:             goto out_put;
02339:
02340:         spin_lock(&sd_index_lock);
02341:         error = ida_get_new(&sd_index_ida, &index);
02342:         spin_unlock(&sd_index_lock);
02343:     } while (error == -EAGAIN);
02344:
02345:     if (error)
02346:         goto out_put;
02347:
02348:     error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
02349:     if (error)
02350:         goto out_free_index;
02351:
02352:     sdkp->device = sdp;
02353:     sdkp->driver = &sd_template;
02354:     sdkp->disk = gd;
02355:     sdkp->index = index;
02356:     sdkp->openers = 0;
02357:     sdkp->previous_state = 1;
02358:
02359:     if (!sdp->request_queue->rq_timeout) {
02360:         if (sdp->type != TYPE_MOD)
02361:             blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
02362:         else
02363:             blk_queue_rq_timeout(sdp->request_queue,
02364:                 SD_MOD_TIMEOUT);
02365:     }
02366:
02367:     device_initialize(&sdkp->dev);
02368:     sdkp->dev.parent = &sdp->sdev_gendev;
02369:     sdkp->dev.class = &sd_disk_class;
02370:     dev_set_name(&sdkp->dev, dev_name(&sdp->sdev_gendev));
02371:
02372:     if (device_add(&sdkp->dev))
02373:         goto out_free_index;
02374:
02375:     get_device(&sdp->sdev_gendev);
02376:
02377:     get_device(&sdkp->dev); /* prevent release before async_schedule */
02378:     async_schedule(sd_probe_async, sdkp);
02379:
02380:     return 0;
02381:
02382: out_free_index:
02383:     spin_lock(&sd_index_lock);
02384:     ida_remove(&sd_index_ida, index);
02385:     spin_unlock(&sd_index_lock);
02386: out_put:
02387:     put_disk(gd);
02388: out_free:
02389:     kfree(sdkp);
02390: out:
02391:     return error;
02392: } /* end sd_probe */
02393:
sd_probe_async()函数也在drivers/scsi/sd.c文件中。
02238: /*
02239:  * The asynchronous part of sd_probe
02240:  */
02241: static void sd_probe_async(void *data, async_cookie_t cookie)
02242: {
02243:     struct scsi_disk *sdkp = data;
02244:     struct scsi_device *sdp;
02245:     struct gendisk *gd;
02246:     u32 index;
02247:     struct device *dev;
02248:
02249:     sdp = sdkp->device;
02250:     gd = sdkp->disk;
02251:     index = sdkp->index;
02252:     dev = &sdp->sdev_gendev;
02253:
02254:     if (index < SD_MAX_DISKS) {
02255:         gd->major = sd_major((index & 0xf0) >> 4);
02256:         gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
02257:         gd->minors = SD_MINORS;
02258:     }
02259:     gd->fops = &sd_fops;
02260:     gd->private_data = &sdkp->driver;
02261:     gd->queue = sdkp->device->request_queue;
02262:
02263:     /* defaults, until the device tells us otherwise */
02264:     sdp->sector_size = 512;
02265:     sdkp->capacity = 0;
02266:     sdkp->media_present = 1;
02267:     sdkp->write_prot = 0;
02268:     sdkp->WCE = 0;
02269:     sdkp->RCD = 0;
02270:     sdkp->ATO = 0;
02271:     sdkp->first_scan = 1;
02272:
02273:     sd_revalidate_disk(gd);
02274:
02275:     blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);
02276:     blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);
02277:
02278:     gd->driverfs_dev = &sdp->sdev_gendev;
02279:     gd->flags = GENHD_FL_EXT_DEVT | GENHD_FL_DRIVERFS;
02280:     if (sdp->removable)
02281:         gd->flags |= GENHD_FL_REMOVABLE;
02282:
02283:     dev_set_drvdata(dev, sdkp);
02284:     add_disk(gd);
02285:     sd_dif_config_host(sdkp);
02286:
02287:     sd_revalidate_disk(gd);
02288:
02289:     sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
02290:         sdp->removable ? "removable " : "");
02291:     put_device(&sdkp->dev);
02292: } /* end sd_probe_async */
楼主那怎么解决的呢。
可以解决,但需要修改内核SCSI模块。即将sd_probe()函数中的 async_schedule(sd_probe_async, sdkp); 这一异步加入硬盘的方式,修改为2.6.18内核那样的同步加载硬盘方式(在sd_probe()中按探测顺序直接完成add_disk())。
楼主,你好,想请教一个问题:AS6.0的内核检查磁盘分配盘符时,是优先分配主板板载接口的磁盘还是优先从PCI设备(比如RAID控制器下的逻辑盘),开始检查分配盘符。这个扫描顺序是由哪个函数控制的。非常感谢。
一般主板板载接口的硬盘控制器与PCI接口的控制器厂商不同,驱动也不同。
分配盘符顺序以加载各自的控制器驱动先后顺序为准。可以在initrd文件中,rc.sysinit或其他启动脚本(发行版本不同,有所区别)修改加载驱动顺序。
在RHEL6.0中,修改initramfs文件中的init脚本,就可以调整硬盘控制器驱动加载顺序。
直接在init文件中加入如下命令(建议在文件前面)
modprobe ahci
modprobe mptsas
解压initramfs文件如下:
[root@hadoop-03 initrd]# gzip -dc initramfs-2.6.32-220.el6.img | cpio -idm
[root@hadoop-03 initrd]# ls
bin dev emergency init initqueue-finished lib mount pre-trigger proc sys tmp var
cmdline dracut-004-53.el6 etc initqueue initqueue-settled lib64 pre-pivot pre-udev sbin sysroot usr
楼主,您好!请教一下
加载各自的控制器驱动先后顺序,这个顺序在每次检测时有控制机制吗?如果不指定启动顺序每次重启都有可能发生顺序的变化?比如本地磁盘的和外部存储的驱动,如果不指定加载顺序,每次都有可能发生变化?
不同控制器,驱动不同。以加载驱动先后顺序为准。加载驱动顺序是固定的,也可以调整。