在x86架构中,我们对Time Stamp Counter (TSC) 寄存器非常熟悉,通过这个寄存器对代码执行时间的衡量可精确到CPU Cycle级别。
但在ARM/ARMv8/aarch64架构中,并没有与x86 TSC对应的寄存器和直接对应的汇编指令rdtsc。
若想在ARMv8架构中,统计计算代码执行时间达到CPU Cycle级别,也需要读取类似x86的TSC寄存器。在ARMv8中,有Performance Monitors Control Register系列寄存器,其中PMCCNTR_EL0就类似于x86的TSC寄存器。本文介绍Linux下读取ARM TSC方法。
读取这个PMCCNTR_EL0寄存器值,就可以知道当前CPU已运行了多少Cycle。但在ARM下读取CPU Cycle和x86有所不同:
1、x86用户态代码可以随便读取TSC值。但在ARM,默认情况是用户态是不可以读的,需要在内核态使能后,用户态才能读取。
开关由寄存器PMUSERENR_EL0控制。实际上这个寄存器控制整个PMU寄存器在用户态是否可读写,不仅仅是PMCCNTR_EL0。
在内核态使能,可以是编写单独内核模块,也可以在内核代码任意被执行的位置加上设置使能PMU寄存器代码即可。Linux下使能(Enable)用户态访问PMU内核模块代码:
/* * Enable user-mode ARM performance counter access. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/smp.h> #define PERF_DEF_OPTS (1 | 16) #define PERF_OPT_RESET_CYCLES (2 | 4) #define PERF_OPT_DIV64 (8) #define ARMV8_PMCR_MASK 0x3f #define ARMV8_PMCR_E (1 << 0) /* Enable all counters */ #define ARMV8_PMCR_P (1 << 1) /* Reset all counters */ #define ARMV8_PMCR_C (1 << 2) /* Cycle counter reset */ #define ARMV8_PMCR_D (1 << 3) /* CCNT counts every 64th cpu cycle */ #define ARMV8_PMCR_X (1 << 4) /* Export to ETM */ #define ARMV8_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMCR_LC (1 << 6) /* Cycle Counter 64bit overflow*/ #define ARMV8_PMCR_N_SHIFT 11 /* Number of counters supported */ #define ARMV8_PMCR_N_MASK 0x1f #define ARMV8_PMUSERENR_EN_EL0 (1 << 0) /* EL0 access enable */ #define ARMV8_PMUSERENR_CR (1 << 2) /* Cycle counter read enable */ #define ARMV8_PMUSERENR_ER (1 << 3) /* Event counter read enable */ static inline u32 armv8pmu_pmcr_read(void) { u64 val=0; asm volatile("mrs %0, pmcr_el0" : "=r" (val)); return (u32)val; } static inline void armv8pmu_pmcr_write(u32 val) { val &= ARMV8_PMCR_MASK; isb(); asm volatile("msr pmcr_el0, %0" : : "r" ((u64)val)); } static inline long long armv8_read_CNTPCT_EL0(void) { long long val; asm volatile("mrs %0, CNTVCT_EL0" : "=r" (val)); return val; } static void enable_cpu_counters(void* data) { u32 val=0; asm volatile("msr pmuserenr_el0, %0" : : "r"(0xf)); armv8pmu_pmcr_write(ARMV8_PMCR_LC|ARMV8_PMCR_E); asm volatile("msr PMCNTENSET_EL0, %0" :: "r" ((u32)(1<<31))); armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMCR_E|ARMV8_PMCR_LC); printk("\nCPU:%d ", smp_processor_id()); } static void disable_cpu_counters(void* data) { u32 val=0; printk(KERN_INFO "\ndisabling user-mode PMU access on CPU #%d", smp_processor_id()); /* Program PMU and disable all counters */ armv8pmu_pmcr_write(armv8pmu_pmcr_read() |~ARMV8_PMCR_E); asm volatile("msr pmuserenr_el0, %0" : : "r"((u64)0)); 
} static int __init init(void) { u64 cval; u32 val; isb(); asm volatile("mrs %0, PMCCNTR_EL0" : "=r"(cval)); printk("\nCPU Cycle count:%llu \n", cval); asm volatile("mrs %0, PMCNTENSET_EL0" : "=r"(val)); printk("PMCNTENSET_EL0:%lX ", val); asm volatile("mrs %0, PMCR_EL0" : "=r"(val)); printk("\nPMCR_EL0 Register:%lX ", val); on_each_cpu(enable_cpu_counters, NULL, 1); printk(KERN_INFO "Enable Access PMU Initialized"); return 0; } static void __exit fini(void) { on_each_cpu(disable_cpu_counters, NULL, 1); printk(KERN_INFO "Access PMU Disabled"); } module_init(init); module_exit(fini);
2、x86下TSC的值,在CPU上电后就开始累加,且是只读寄存器。但在ARM中,只有使能PMCCNTR_EL0后,TSC才开始累加计数,且PMCCNTR_EL0寄存器可清零,相当于计时器。
用户态读取ARMv8 PMU寄存器代码:
/*
 * User-space reader for the ARMv8 PMU registers, in particular the cycle
 * counter PMCCNTR_EL0.
 *
 * The PMU system registers are only accessible from user mode after the
 * kernel has enabled EL0 access; otherwise each mrs/msr below raises an
 * illegal-instruction fault.
 */
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

/* All counters, including PMCCNTR_EL0, are disabled/enabled */
#define QUADD_ARMV8_PMCR_E  (1 << 0)
/* Reset all event counters, not including PMCCNTR_EL0, to 0 */
#define QUADD_ARMV8_PMCR_P  (1 << 1)
/* Reset PMCCNTR_EL0 to 0 */
#define QUADD_ARMV8_PMCR_C  (1 << 2)
/* Clock divider: PMCCNTR_EL0 counts every clock cycle/every 64 clock cycles */
#define QUADD_ARMV8_PMCR_D  (1 << 3)
/* Export of events is disabled/enabled */
#define QUADD_ARMV8_PMCR_X  (1 << 4)
/* Disable cycle counter, PMCCNTR_EL0 when event counting is prohibited */
#define QUADD_ARMV8_PMCR_DP (1 << 5)
/* Long cycle count enable */
#define QUADD_ARMV8_PMCR_LC (1 << 6)

/*
 * Mask for writable bits. It must cover LC (bit 6); with 0x3f a
 * read-modify-write of PMCR_EL0 would clear a previously set LC bit.
 */
#define ARMV8_PMCR_MASK 0x7f

/*
 * Set to 1 to let this program enable/reset the counters itself by writing
 * PMCR_EL0. The default (0) keeps enable_all_counters() and
 * reset_all_counters() as no-ops, which is how they behaved before.
 */
#ifndef PMU_USER_WRITES_PMCR
#define PMU_USER_WRITES_PMCR 0
#endif

typedef unsigned int u32;
typedef unsigned long long u64;

/* Instruction synchronization barrier; __asm__ also works under -std=c11. */
#define isb() __asm__ __volatile__("isb" : : : "memory")

/* Read a system register into a 64-bit lvalue (mrs always moves a full X register). */
#define PMU_READ_SYSREG(reg, out) \
    __asm__ __volatile__("mrs %0, " #reg : "=r" (out))

/* Read Performance Monitors Control Register; returns its low 32 bits. */
static inline unsigned int armv8_pmu_pmcr_read(void)
{
    u64 val;

    PMU_READ_SYSREG(pmcr_el0, val);
    return (unsigned int)val;
}

/* Write the writable bits of val to PMCR_EL0. */
static inline void armv8_pmu_pmcr_write(unsigned int val)
{
    __asm__ __volatile__("msr pmcr_el0, %0"
                         : : "r" ((u64)(val & ARMV8_PMCR_MASK)));
}

/*
 * Read the generic timer's virtual count.
 * NOTE: despite its name this helper reads CNTVCT_EL0, not CNTPCT_EL0.
 * It is not called by this program.
 */
static inline long long armv8_read_CNTPCT_EL0(void)
{
    long long val;

    PMU_READ_SYSREG(CNTVCT_EL0, val);
    return val;
}

/* Enable all counters (E) and event export (X); no-op unless PMU_USER_WRITES_PMCR. */
static void enable_all_counters(void)
{
#if PMU_USER_WRITES_PMCR
    unsigned int val = armv8_pmu_pmcr_read();

    val |= QUADD_ARMV8_PMCR_E | QUADD_ARMV8_PMCR_X;
    armv8_pmu_pmcr_write(val);
#endif
}

/* Reset event counters (P) and the cycle counter (C); no-op unless PMU_USER_WRITES_PMCR. */
static void reset_all_counters(void)
{
#if PMU_USER_WRITES_PMCR
    unsigned int val = armv8_pmu_pmcr_read();

    val |= QUADD_ARMV8_PMCR_P | QUADD_ARMV8_PMCR_C;
    armv8_pmu_pmcr_write(val);
#endif
}

static unsigned int enabled = 0;

/*
 * Sample the cycle counter together with the wall-clock time.
 *
 * result must point to at least three elements and receives:
 *   result[0] = low 32 bits of PMCCNTR_EL0
 *   result[1] = microseconds part of gettimeofday()
 *   result[2] = seconds part of gettimeofday() (truncated to unsigned int)
 *
 * Returns the same value as result[0]. The previous version stored the
 * PMCR_EL0 control register here instead of a cycle count.
 */
unsigned int readticks(unsigned int *result)
{
    struct timeval t;
    u64 cycles;
    unsigned int cc;

    if (!enabled) {
        reset_all_counters();
        enable_all_counters();
        enabled = 1;
    }

    isb();
    PMU_READ_SYSREG(PMCCNTR_EL0, cycles);
    cc = (unsigned int)cycles;

    gettimeofday(&t, NULL);
    result[0] = cc;
    result[1] = (unsigned int)t.tv_usec;
    result[2] = (unsigned int)t.tv_sec;
    return cc;
}

/* Same as armv8_pmu_pmcr_read(); kept under its original second name. */
static inline unsigned int armv8pmu_pmcr_read(void)
{
    return armv8_pmu_pmcr_read();
}

/*
 * Return the current 64-bit value of the cycle counter PMCCNTR_EL0.
 * NOTE: the name is historical; this does not read CNTPCT_EL0.
 */
static inline u64 arch_counter_get_cntpct(void)
{
    u64 cval;

    isb();
    /* "=r": output only. The former "+r" read cval while uninitialized. */
    PMU_READ_SYSREG(PMCCNTR_EL0, cval);
    return cval;
}

/* Print PMCR_EL0, the cycle counter before and after a 5 s sleep, and a dump of PMU registers. */
int main(void)
{
    u64 timer;
    u64 reg;
    u32 pmcr_el;

    pmcr_el = armv8pmu_pmcr_read();
    printf("\nPMCR_EL0 Register:%X ", pmcr_el);

    timer = arch_counter_get_cntpct();
    printf("\nCPU Cycle Count:0x%llX ", timer);
    sleep(5);
    timer = arch_counter_get_cntpct();
    printf("\nCPU Cycle Count:0x%llX \n", timer);

    PMU_READ_SYSREG(PMOVSCLR_EL0, reg);
    printf(" Register PMOVSCLR_EL0:0x%llX \n", reg);
    PMU_READ_SYSREG(pmuserenr_el0, reg);
    printf(" Register pmuserenr_el0:0x%llX \n", reg);
    PMU_READ_SYSREG(PMCNTENSET_EL0, reg);
    printf(" Register PMCNTENSET_EL0:0x%llX \n", reg);
    PMU_READ_SYSREG(PMCCFILTR_EL0, reg);
    printf(" Register PMCCFILTR_EL0:0x%llX \n", reg);
    PMU_READ_SYSREG(PMCNTENCLR_EL0, reg);
    printf(" Register PMCNTENCLR_EL0:0x%llX \n", reg);
    PMU_READ_SYSREG(PMOVSSET_EL0, reg);
    printf(" Register PMOVSSET_EL0:0x%llX \n", reg);

    return 0;
}
Table 11-4 PMCR_EL0 bit assignments
Bits | Name | Function | ||||
---|---|---|---|---|---|---|
[31:24] | IMP |
Implementer code:
This is a read-only field.
|
||||
[23:16] | IDCODE |
Identification code:
This is a read-only field.
|
||||
[15:11] | N |
Number of event counters.
In Non-secure modes other than Hyp mode, this field reads the value of HDCR.HPMN. See 4.5.12 Hyp Debug Control Register.
In Secure state and Hyp mode, this field returns the number of event counters implemented.
This is a read-only field.
|
||||
[10:7] | – | Reserved, RES0. | ||||
[6] | LC |
Long cycle count enable. Selects which PMCCNTR_EL0 bit generates an overflow recorded in PMOVSR[31]:
|
||||
[5] | DP |
Disable cycle counter, PMCCNTR_EL0 when event counting is prohibited:
This bit is read/write.
|
||||
[4] | X |
Export enable. This bit permits events to be exported to another debug device, such as a trace macrocell, over an event bus:
This bit is read/write and does not affect the generation of Performance Monitors interrupts, that can be implemented as a signal exported from the processor to an interrupt controller.
|
||||
[3] | D |
Clock divider:
This bit is read/write.
|
||||
[2] | C |
Clock counter reset:
Note: Resetting PMCCNTR_EL0 does not clear the PMCCNTR_EL0 overflow bit to 0. See the ARM® Architecture Reference Manual ARMv8 for more information. This bit is write-only, and always RAZ.
|
||||
[1] | P |
Event counter reset:
In Non-secure modes other than Hyp mode, a write of 1 to this bit does not reset event counters that the HDCR.HPMN field reserves for Hyp mode use. See 4.5.12 Hyp Debug Control Register.
In Secure state and Hyp mode, a write of 1 to this bit resets all the event counters.
|
||||
[0] | E |
Enable bit. This bit does not disable or enable, counting by event counters reserved for Hyp mode by HDCR.HPMN. It also does not suppress the generation of performance monitor overflow interrupt requests by those counters:
This bit is read/write.
|
示例代码下载:read aarch64 TSC
正一,你好:
我从x86到arm64迁移,遇到tsc 的问题,x86平台的实现如下:
#define rdtsc(low,high) __asm__ __volatile("rdtsc" : "=a"(low), "=d"(high))
请教一下,1. 如何获取timer中的高位32和低位32?获取高32位和低32位是否符合以上函数的要求呢?例如执行结果中CPU Cycle Count:0x136C23FE71D,位数很多。
2. main函数中timer后面的一系列asm volatile操作,是否是必须?
您给出的源码下载有乱码,麻烦您再传给我一份,感谢~~
1、低32位和高32位获取问题,网上有很多参考代码。建议阅读《深入理解计算机系统》(Computer Systems: A Programmer's Perspective),这本书有详细介绍。
2、源码乱码问题,请尝试dos2unix试一下转换文件。
请问,你提出的问题解决了吗,我现在也有一个和您同样的问题
遇到了具体什么问题?
就是请问高32位和低32位如何取出来呢?
你好,我也遇到类似问题,请问您解决了吗
你好,
我在跑测试程序的时候有遇到 Illegal instruction,似乎在用户态还是不能访问 pmcr_el0 和 PMCCNTR_EL0。但 insmod 之后已经有显示 Enable Access PMU Initialized。请问我是哪里弄错了吗?我在 Jetson Nano 上进行的测试。
你好,我也遇到了同样的问题,请问是否解决?
你好 我也遇到了 请问有解决嘛
你好 请问解决了使用msr 时的illegal instruction嘛 有偿求助
博主提供的内核模块有问题,实际并没有打开用户态访问PMU register的权限,而是仅仅通过PMCR寄存器使能counter开始计数,所以EL0会出现 Illegal instruction。
实际需要配置pmuserenr_el0。
https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/PMUSERENR-EL0–Performance-Monitors-User-Enable-Register
你好, X86的rdtsc是多核之间都是同步的, 不同核的时间戳可以相减, ARM的PMCCNTR_EL0寄存器在各个核之间是同步的吗??
两个tsc时间差是什么单位呢?怎么将这个差值转换为纳秒呢