iximeow/msr_peek.rs

## msr_peek.rs
//! # hey! you're curious about the characteristics of reading various MSRs? me too!
//!
//! this reads a few registers (TSC, AMD RAPL core and package power) 2M times and reports a bit
//! about the adventure. you need root wherever you run it. runs on linux (need the `msr` module
//! loaded) or illumos.
//!
//! you probably want to bind this to a single CPU when run. if you don't, it might work
//! reasonably, or if it's task-switched between cores while running, core-specific MSRs (RAPL core
//! power) will have nonsense skew.
//!
//! on linux that's something like `sudo taskset -c 0 ./msr_peek`
//! on illumos that's something like `pfexec pbind -e 0 ./msr_peek`
//!
//! this is written to be buildable as a standalone .rs. you can build it with cargo, if you want.
//! but you can also `rustc -C opt-level=3 msr_peek.rs`.
//!
//! # results
//!
//! ## 3950x/linux
//!
//! [13:54:06] # iximeow:~> sudo taskset -c 0 ./msr_peek
//! [sudo] password for iximeow:
//! read 2000000 samples in 7.683337ms (3ns each)
//! /!\ MSR 0x000010, TSC /!\
//! read 2000000 samples in 435.246707ms (217ns each)
//! changed by 3495367.75 per ms (total diff: 1520484945)
//! averaged 1.00 consecutive read per change, for 217ns per change
//! average change per tick: 760.24
//! /!\ MSR 0xc001029a, RAPL Core Power /!\
//! read 2000000 samples in 484.547539ms (242ns each)
//! changed by 510.72 per ms (total diff: 247189)
//! averaged 4123.71 consecutive reads per change, for 997766ns per change
//! average change per tick: 510.72
//! /!\ MSR 0xc001029b, RAPL Package Power /!\
//! read 2000000 samples in 558.854677ms (279ns each)
//! changed by 3413.75 per ms (total diff: 1904871)
//! averaged 3577.82 consecutive reads per change, for 997983ns per change
//! average change per tick: 3413.75
//!
//! ## 7950x/illumos
//!
//! root@helios:/rpool/devel/oxide/msr_peek# pbind -e 0 ./target/release/msr_peek
//! read 2000000 samples in 8.852352ms (4ns each)
//! /!\ MSR 0x000010, TSC /!\
//! read 2000000 samples in 716.010781ms (358ns each)
//! changed by 4499621.50 per ms (total diff: 3221729010)
//! averaged 1.00 consecutive read per change, for 358ns per change
//! average change per tick: 1610.87
//! /!\ MSR 0xc001029a, RAPL Core Power /!\
//! read 2000000 samples in 939.279162ms (469ns each)
//! changed by 10083.45 per ms (total diff: 9468358)
//! averaged 2127.66 consecutive reads per change, for 997563ns per change
//! average change per tick: 10083.45
//! /!\ MSR 0xc001029b, RAPL Package Power /!\
//! read 2000000 samples in 851.845292ms (425ns each)
//! changed by 3044.61 per ms (total diff: 2590965)
//! averaged 2344.67 consecutive reads per change, for 996200ns per change
//! average change per tick: 3041.04
//!
//! ## 7713p/illumos
//!
//! # pbind -e 0 ./msr_peek
//! read 2000000 samples in 11.602298ms (5ns each)
//! /!\ MSR 0x000010, TSC /!\
//! read 2000000 samples in 942.356169ms (471ns each)
//! changed by 1996969.00 per ms (total diff: 1881144860)
//! averaged 1.00 consecutive read per change, for 471ns per change
//! average change per tick: 940.57
//! /!\ MSR 0xc001029a, RAPL Core Power /!\
//! read 2000000 samples in 1.040690635s (520ns each)
//! changed by 209.42 per ms (total diff: 217794)
//! averaged 1924.93 consecutive reads per change, for 1000480ns per change
//! average change per tick: 209.82
//! /!\ MSR 0xc001029b, RAPL Package Power /!\
//! read 2000000 samples in 1.167764682s (583ns each)
//! changed by 5661.90 per ms (total diff: 6607439)
//! averaged 1715.27 consecutive reads per change, for 999845ns per change
//! average change per tick: 5671.62
//!
//! # remarks
//!
//! TSC counts at just about the core frequency. no surprise here. TSC updates on every read,
//! that's minorly interesting!
//!
//! in both cases reads go through some layers of abstraction to actually issue an `rdmsr`, so the
//! floor here is not reflective of the hardware's responsiveness to `rdmsr`. it's a syscall,
//! virtual filesystem translation, indirect calls therein, an rdmsr, then any syscall exit
//! overhead w.r.t uarch state flushing for the kernel/user boundary. a TSC read is pretty fast
//! though, and changes between that and other MSRs are probably actually due to the MSR choice.
//!
//! yes, on a 7950x the package power MSR is faster to read than the core power MSR. surprise to
//! me!
//!
//! RAPL registers update about once a millisecond. this is consistent with advice that reads of
//! RAPL registers must ensure that measurements are taken more than 1ms apart so as to be
//! meaningful.
//!
//! RAPL registers count in units of 1/65536 joules. the core being used here is under ~100% load,
//! so one can expect it to be in its highest power state. RAPL-reported draw on:
//! * 3950x: 7.79W core/52.1W package. this system was otherwise busy, other cores active, etc.
//! * 7950x: 153.86W core/46.4W package. this system was otherwise idle.
//!   - note core and package power are measured in separate steps. this measurement suggests
//!     that measuring package power momentarily interferes with turbo, and is preventing the
//!     busy core from being kept in its highest-power state.
//! * 7713p: 3.2W core/86.5W package. this system was otherwise idle.
//!   - server cores don't get clocked terribly high even under load. not too surprising here.

use std::fmt::Write;
use std::os::fd::IntoRawFd;
use std::os::fd::RawFd;
use std::time::Duration;

/// How many back-to-back reads are taken for each MSR (2M).
// reading in a tight loop this might take a second or two if we're looking at a full microsecond
// per read. every read is a syscall though (ioctl on illumos, pread on linux), so, i guess we'll
// see what we see.
const SAMPLE_COUNT: usize = 2 * 1000 * 1000;

/// Describes one model-specific register this tool samples.
#[derive(Clone, Copy)]
struct MsrDesc {
    /// MSR number, handed unchanged to `MsrCtx::rdmsr`.
    msr_nr: u64,
    /// Label printed in the report header for this register.
    name: &'static str,
}

/// Handle to the OS interface through which MSRs are read.
struct MsrCtx {
    /// Raw descriptor of the device node opened in `MsrCtx::new` (`/dev/cpu/self/cpuid` on
    /// illumos, `/dev/cpu/0/msr` on linux).
    msr_fd: RawFd,
}

impl MsrCtx {
    /// Opens the platform's MSR access device and wraps its raw descriptor.
    ///
    /// * illumos: `/dev/cpu/self/cpuid`
    /// * linux: `/dev/cpu/0/msr`
    ///
    /// Ownership of the descriptor is released with `into_raw_fd`, so nothing here closes it; it
    /// stays open until the process exits.
    ///
    /// # Panics
    ///
    /// Panics if the device node cannot be opened, or (at runtime) when built for an OS other
    /// than illumos or linux.
    fn new() -> Self {
        #[cfg(target_os = "illumos")]
        {
            // happens to be "self" at time of query. if we get switched to another CPU, we'll be
            // checking someone else's MSRs with IPIs. spooky!!! pbind to a single cpu for best
            // results.
            let cpu_file = std::fs::File::open("/dev/cpu/self/cpuid").expect("can open cpuid fd");
            return Self {
                msr_fd: cpu_file.into_raw_fd(),
            };
        };

        #[cfg(target_os = "linux")]
        {
            // arbitrarily pick cpu 0. run on CPU 0 for best results.
            let cpu_file = std::fs::File::open("/dev/cpu/0/msr").expect("can open msr fd");
            return Self {
                msr_fd: cpu_file.into_raw_fd(),
            };
        };

        #[cfg(not(any(target_os = "illumos", target_os = "linux")))]
        panic!("unsupported target OS");
    }

    /// Reads MSR number `msr` through the descriptor held by this context and returns its 64-bit
    /// value.
    ///
    /// On illumos this is a `CPUID_RDMSR` ioctl; on linux it is an 8-byte `pread` at file offset
    /// `msr`.
    ///
    /// # Panics
    ///
    /// Panics (reporting the OS error) if the read is rejected, or when built for an OS other
    /// than illumos or linux.
    #[inline(always)]
    fn rdmsr(&self, msr: u64) -> u64 {
        #[cfg(target_os = "illumos")]
        {
            // trusting the system can provide an ioctl here instead of depending on libc and
            // requiring cargo to build this file...
            extern "C" {
                // C prototype: `int ioctl(int fildes, int request, ...)`. declared variadic and
                // returning `int` so the call matches the real ABI. `request` is an `int` on
                // illumos, `unsigned long` on linux.
                fn ioctl(fd: std::os::fd::RawFd, op: std::ffi::c_int, ...) -> std::ffi::c_int;
            }

            const CPUID_IOC: i32 =
                ((b'c' as i32) << 24) | ((b'i' as i32) << 16) | ((b'd' as i32) << 8);
            const CPUID_RDMSR: i32 = CPUID_IOC | 1;

            // NOTE(review): layout is meant to mirror the driver's rdmsr request struct; confirm
            // the field widths against the illumos `sys/cpuid_drv.h` header.
            #[repr(C)]
            struct CpuidRdmsr {
                msr_nr: u64,
                msr_val: u64,
            }

            let mut crm = CpuidRdmsr {
                msr_nr: msr,
                msr_val: 0,
            };

            // SAFETY: `crm` is a live, writable `repr(C)` struct that outlives the call, and the
            // pointer passed is the only argument the request consumes.
            let res = unsafe { ioctl(self.msr_fd, CPUID_RDMSR, &mut crm as *mut CpuidRdmsr) };
            assert_eq!(
                res,
                0,
                "CPUID_RDMSR ioctl for MSR {:#x} failed: {}",
                msr,
                std::io::Error::last_os_error()
            );

            return crm.msr_val;
        }
        #[cfg(target_os = "linux")]
        {
            extern "C" {
                // C prototype: `ssize_t pread(int fd, void *buf, size_t count, off_t offset)`.
                // `off_t` is a signed 64-bit integer on 64-bit linux targets, which is what this
                // tool is built for.
                fn pread(
                    fd: std::os::fd::RawFd,
                    buf: *mut std::ffi::c_void,
                    count: usize,
                    offset: i64,
                ) -> isize;
            }

            let mut v: u64 = 0;

            // SAFETY: `v` is a live, writable 8-byte buffer and exactly 8 bytes are requested.
            let res = unsafe {
                pread(
                    self.msr_fd,
                    &mut v as *mut u64 as *mut std::ffi::c_void,
                    std::mem::size_of::<u64>(),
                    // MSR numbers are 32-bit, so this cast cannot change the value.
                    msr as i64,
                )
            };
            assert_eq!(
                res,
                8,
                "pread of MSR {:#x} failed: {}",
                msr,
                std::io::Error::last_os_error()
            );

            return v;
        }

        #[cfg(not(any(target_os = "illumos", target_os = "linux")))]
        panic!("unsupported target OS");
    }
}

/// Time stamp counter, MSR `0x10`.
const TSC: MsrDesc = MsrDesc {
    name: "TSC",
    msr_nr: 0x0000_0010,
};

/// AMD RAPL per-core counter, MSR `0xC001_029A`.
const CORE_POWER: MsrDesc = MsrDesc {
    name: "RAPL Core Power",
    msr_nr: 0xC001_029A,
};

/// AMD RAPL per-package counter, MSR `0xC001_029B`.
const PACKAGE_POWER: MsrDesc = MsrDesc {
    name: "RAPL Package Power",
    msr_nr: 0xC001_029B,
};

/// Reads `msr` back-to-back `samples` times and returns the values, in read order, together with
/// the elapsed wall time.
///
/// The output buffer is allocated before the clock starts, so the returned duration covers only
/// the read loop.
fn sample_msr(msr_ctx: &MsrCtx, msr: MsrDesc, samples: usize) -> (Vec<u64>, Duration) {
    let mut readings = Vec::with_capacity(samples);

    let timer = std::time::Instant::now();
    readings.extend((0..samples).map(|_| msr_ctx.rdmsr(msr.msr_nr)));
    let elapsed = timer.elapsed();

    (readings, elapsed)
}

/// Baseline for `sample_msr`: runs the same timed fill loop but stores the MSR number itself
/// instead of reading the register, so no syscall is made.
///
/// Returns the filled buffer (`samples` copies of `msr.msr_nr`) and the elapsed wall time.
fn no_op(_msr_ctx: &MsrCtx, msr: MsrDesc, samples: usize) -> (Vec<u64>, Duration) {
    let mut filler = Vec::with_capacity(samples);

    let timer = std::time::Instant::now();
    filler.extend((0..samples).map(|_| msr.msr_nr));
    let elapsed = timer.elapsed();

    (filler, elapsed)
}

// in case you want to write out all the samples from an MSR...
/// Renders every sample in decimal, one per line, each line newline-terminated.
// not called by default; kept around as a debugging aid.
#[allow(dead_code)]
fn stringify(samples: Vec<u64>) -> String {
    samples.iter().fold(String::new(), |mut text, value| {
        writeln!(text, "{}", value).expect("works");
        text
    })
}

/// Prints a summary of one MSR sampling run to stdout.
///
/// Reports the read rate, how far the value moved between the first and last sample, how many
/// consecutive reads returned the same value before it changed, and the average size of each
/// increase.
///
/// * `msr` - the register that was sampled; used for the header line only.
/// * `samples` - the values read, in read order.
/// * `duration` - wall time taken to collect `samples`.
///
/// An empty `samples` prints the header and a short note. If the counter went backwards during
/// the run (e.g. it wrapped), a warning is written to stderr rather than panicking.
fn show_stats(msr: MsrDesc, samples: Vec<u64>, duration: Duration) {
    println!("/!\\ MSR {:#08x}, {} /!\\", msr.msr_nr, msr.name);

    let (first, last) = match (samples.first(), samples.last()) {
        (Some(&first), Some(&last)) => (first, last),
        _ => {
            println!("read 0 samples in {:?}; nothing to summarize", duration);
            return;
        }
    };

    let sample_count = samples.len();
    // divide in nanoseconds so the divisor is never truncated to 32 bits. the quotient is at most
    // the run's own nanosecond count, so narrowing it back to u64 is lossless for any real run.
    let time_per_sample =
        Duration::from_nanos((duration.as_nanos() / sample_count as u128) as u64);
    println!(
        "read {} samples in {:?} ({:?} each)",
        sample_count, duration, time_per_sample
    );

    // wrapping: if the counter rolled over mid-run, `last < first`. a plain subtraction would
    // panic in debug builds before the wraparound warning below gets a chance to print.
    let first_last_diff = last.wrapping_sub(first);
    // fractional milliseconds in f64: whole-ms truncation skews short runs (and divides by zero
    // below 1ms), and f32 cannot represent large diffs exactly.
    let diff_per_ms = first_last_diff as f64 / (duration.as_secs_f64() * 1000.0);
    println!(
        "changed by {:.2} per ms (total diff: {})",
        diff_per_ms, first_last_diff
    );
    let min = samples.iter().copied().min().unwrap_or(first);
    let max = samples.iter().copied().max().unwrap_or(last);
    if max - min != first_last_diff {
        eprintln!("! first was not min or last was not max (wraparound while running?)");
    }

    // consecutive reads may return the same value. some MSRs note that reads must be spaced more
    // than 1ms apart to compute meaningful deltas. validate that here.
    //
    // a "tick" is a run of identical consecutive samples; only increases contribute to the
    // average change, so a backwards step (wraparound) does not poison it.
    let mut ticks: u64 = 1;
    let mut increase_total = 0.0f64;
    let mut increase_count: u64 = 0;
    for pair in samples.windows(2) {
        let (prev, cur) = (pair[0], pair[1]);
        if cur == prev {
            continue;
        }
        ticks += 1;
        if cur > prev {
            increase_total += (cur - prev) as f64;
            increase_count += 1;
        }
    }

    // run lengths sum to the sample count, so this is the mean run length.
    let avg_reads_per_tick = sample_count as f64 / ticks as f64;
    // computed from the full duration so neither factor is rounded to an integer first.
    let ns_per_change = duration.as_secs_f64() * 1e9 / ticks as f64;
    println!(
        "averaged {:0.2} consecutive {} per change, for {:.0}ns per change",
        avg_reads_per_tick,
        if avg_reads_per_tick == 1.0 {
            "read"
        } else {
            "reads"
        },
        ns_per_change
    );
    if increase_count == 0 {
        println!("average change per tick: n/a (value never increased)");
    } else {
        println!(
            "average change per tick: {:0.2}",
            increase_total / increase_count as f64
        );
    }
}

/// Entry point: times a no-op baseline loop, then samples and reports on the TSC, core power and
/// package power MSRs, in that order.
pub fn main() {
    let msr_ctx = MsrCtx::new();

    // baseline first: how long does filling the buffer take without touching an MSR?
    let (baseline, baseline_elapsed) = no_op(&msr_ctx, TSC, SAMPLE_COUNT);
    let baseline_len = baseline.len();
    println!(
        "read {} samples in {:?} ({:?} each)",
        baseline_len,
        baseline_elapsed,
        baseline_elapsed / baseline_len as u32
    );

    for msr in [TSC, CORE_POWER, PACKAGE_POWER] {
        let (samples, duration) = sample_msr(&msr_ctx, msr, SAMPLE_COUNT);
        show_stats(msr, samples, duration);
    }
}
	//! # hey! you're curious about the characteristics of reading various MSRs? me too!
	//!
	//! this reads a few registers (TSC, AMD RAPL core and package power) 2M times and reports a bit
	//! about the adventure. you need root wherever you run it. runs on linux (need the `msr` module
	//! loaded) or illumos.
	//!
	//! you probably want to bind this to a single CPU when run. if you don't, it might work
	//! reasonably, or if it's task-switched between cores while running, core-specific MSRs (RAPL core
	//! power) will have nonsense skew.
	//!
	//! on linux that's something like `sudo taskset -c 0 ./msr_peek`
	//! on illumos that's something like `pfexec pbind -e 0 ./msr_peek`
	//!
	//! this is written to be buildable as a standalone .rs. you can build it with cargo, if you want.
	//! but you can also `rustc -C opt-level=3 msr_peek.rs`.
	//!
	//! # results
	//!
	//! ## 3950x/linux
	//!
	//! [13:54:06] # iximeow:~> sudo taskset -c 0 ./msr_peek
	//! [sudo] password for iximeow:
	//! read 2000000 samples in 7.683337ms (3ns each)
	//! /!\ MSR 0x000010, TSC /!\
	//! read 2000000 samples in 435.246707ms (217ns each)
	//! changed by 3495367.75 per ms (total diff: 1520484945)
	//! averaged 1.00 consecutive read per change, for 217ns per change
	//! average change per tick: 760.24
	//! /!\ MSR 0xc001029a, RAPL Core Power /!\
	//! read 2000000 samples in 484.547539ms (242ns each)
	//! changed by 510.72 per ms (total diff: 247189)
	//! averaged 4123.71 consecutive reads per change, for 997766ns per change
	//! average change per tick: 510.72
	//! /!\ MSR 0xc001029b, RAPL Package Power /!\
	//! read 2000000 samples in 558.854677ms (279ns each)
	//! changed by 3413.75 per ms (total diff: 1904871)
	//! averaged 3577.82 consecutive reads per change, for 997983ns per change
	//! average change per tick: 3413.75
	//!
	//! ## 7950x/illumos
	//!
	//! root@helios:/rpool/devel/oxide/msr_peek# pbind -e 0 ./target/release/msr_peek
	//! read 2000000 samples in 8.852352ms (4ns each)
	//! /!\ MSR 0x000010, TSC /!\
	//! read 2000000 samples in 716.010781ms (358ns each)
	//! changed by 4499621.50 per ms (total diff: 3221729010)
	//! averaged 1.00 consecutive read per change, for 358ns per change
	//! average change per tick: 1610.87
	//! /!\ MSR 0xc001029a, RAPL Core Power /!\
	//! read 2000000 samples in 939.279162ms (469ns each)
	//! changed by 10083.45 per ms (total diff: 9468358)
	//! averaged 2127.66 consecutive reads per change, for 997563ns per change
	//! average change per tick: 10083.45
	//! /!\ MSR 0xc001029b, RAPL Package Power /!\
	//! read 2000000 samples in 851.845292ms (425ns each)
	//! changed by 3044.61 per ms (total diff: 2590965)
	//! averaged 2344.67 consecutive reads per change, for 996200ns per change
	//! average change per tick: 3041.04
	//!
	//! ## 7713p/illumos
	//!
	//! # pbind -e 0 ./msr_peek
	//! read 2000000 samples in 11.602298ms (5ns each)
	//! /!\ MSR 0x000010, TSC /!\
	//! read 2000000 samples in 942.356169ms (471ns each)
	//! changed by 1996969.00 per ms (total diff: 1881144860)
	//! averaged 1.00 consecutive read per change, for 471ns per change
	//! average change per tick: 940.57
	//! /!\ MSR 0xc001029a, RAPL Core Power /!\
	//! read 2000000 samples in 1.040690635s (520ns each)
	//! changed by 209.42 per ms (total diff: 217794)
	//! averaged 1924.93 consecutive reads per change, for 1000480ns per change
	//! average change per tick: 209.82
	//! /!\ MSR 0xc001029b, RAPL Package Power /!\
	//! read 2000000 samples in 1.167764682s (583ns each)
	//! changed by 5661.90 per ms (total diff: 6607439)
	//! averaged 1715.27 consecutive reads per change, for 999845ns per change
	//! average change per tick: 5671.62
	//!
	//! # remarks
	//!
	//! TSC counts at just about the core frequency. no surprise here. TSC updates on every read,
	//! that's minorly interesting!
	//!
	//! in both cases reads go through some layers of abstraction to actually issue an `rdmsr`, so the
	//! floor here is not reflective of the hardware's responsiveness to `rdmsr`. it's a syscall,
	//! virtual filesystem translation, indirect calls therein, an rdmsr, then any syscall exit
	//! overhead w.r.t uarch state flushing for the kernel/user boundary. a TSC read is pretty fast
	//! though, and changes between that and other MSRs are probably actually due to the MSR choice.
	//!
	//! yes, on a 7950x the package power MSR is faster to read than the core power MSR. surprise to
	//! me!
	//!
	//! RAPL registers update about once a millisecond. this is consistent with advice that reads of
	//! RAPL registers must ensure that measurements are taken more than 1ms apart so as to be
	//! meaningful.
	//!
	//! RAPL registers count in units of 1/65536 joules. the core being used here is under ~100% load,
	//! so one can expect it to be in its highest power state. RAPL-reported draw on:
	//! * 3950x: 7.79W core/52.1W package. this system was otherwise busy, other cores active, etc.
	//! * 7950x: 153.86W core/46.4W package. this system was otherwise idle.
	//! - note core and package power are measured in separate steps. this measurement suggests
	//! that measuring package power momentarily interferes with turbo, and is preventing the
	//! busy core from being kept in its highest-power state.
	//! * 7713p: 3.2W core/86.5W package. this system was otherwise idle.
	//! - server cores don't get clocked terribly high even under load. not too surprising here.

	use std::fmt::Write;
	use std::os::fd::IntoRawFd;
	use std::os::fd::RawFd;
	use std::time::Duration;

	/// How many back-to-back reads are taken for each MSR (2M).
	// reading in a tight loop this might take a second or two if we're looking at a full microsecond
	// per read. every read is a syscall though (ioctl on illumos, pread on linux), so, i guess we'll
	// see what we see.
	const SAMPLE_COUNT: usize = 2 * 1000 * 1000;

	/// Describes one model-specific register this tool samples.
	#[derive(Clone, Copy)]
	struct MsrDesc {
	    /// MSR number, handed unchanged to `MsrCtx::rdmsr`.
	    msr_nr: u64,
	    /// Label printed in the report header for this register.
	    name: &'static str,
	}

	/// Handle to the OS interface through which MSRs are read.
	struct MsrCtx {
	/// Raw descriptor of the device node opened in `MsrCtx::new` (`/dev/cpu/self/cpuid` on
	/// illumos, `/dev/cpu/0/msr` on linux).
	msr_fd: RawFd,
	}

	impl MsrCtx {
	fn new() -> Self {
	#[cfg(target_os = "illumos")]
	{
	// happens to be "self" as time of query. if we get switched to another CPU, we'll be
	// checking someone else's MSRs with IPIs. sppoky!!! pbind to a single cpu for best
	// results.
	let cpu_file = std::fs::File::open("/dev/cpu/self/cpuid").expect("can open cpuid fd");
	return Self {
	msr_fd: cpu_file.into_raw_fd(),
	};
	};

	#[cfg(target_os = "linux")]
	{
	// arbitrarily pick cpu 0. run on CPU 0 for best results.
	let cpu_file = std::fs::File::open("/dev/cpu/0/msr").expect("can open msr fd");
	return Self {
	msr_fd: cpu_file.into_raw_fd(),
	};
	};

	#[cfg(not(any(target_os = "illumos", target_os = "linux")))]
	panic!("unsupported target OS");
	}

	#[inline(always)]
	fn rdmsr(&self, msr: u64) -> u64 {
	#[cfg(target_os = "illumos")]
	{
	// trusting the system can provide an ioctl here instead of depending on libc and
	// requiring cargo to build this file...
	extern "C" {
	// `request` is an `int` on illumos, `unsigned long` on linux.
	fn ioctl(
	fd: std::os::fd::RawFd,
	op: std::ffi::c_int,
	param: *mut std::ffi::c_void,
	) -> u64;
	}

	const CPUID_IOC: i32 =
	((b'c' as i32) << 24) \| ((b'i' as i32) << 16) \| ((b'd' as i32) << 8);
	const CPUID_RDMSR: i32 = CPUID_IOC \| 1;

	#[repr(C)]
	struct CpuidRdmsr {
	msr_nr: u64,
	msr_val: u64,
	}

	let mut crm: CpuidRdmsr = CpuidRdmsr {
	msr_nr: msr,
	msr_val: 0,
	};

	let res = unsafe {
	ioctl(
	self.msr_fd,
	CPUID_RDMSR,
	&mut crm as mut CpuidRdmsr as mut std::ffi::c_void,
	)
	};
	assert_eq!(res, 0);

	return crm.msr_val as u64;
	}
	#[cfg(target_os = "linux")]
	{
	extern "C" {
	fn pread(
	fd: std::os::fd::RawFd,
	buf: *mut std::ffi::c_void,
	count: isize,
	offset: u64,
	) -> isize;
	}

	let mut v: u64 = 0;

	let res = unsafe {
	pread(
	self.msr_fd,
	&mut v as mut u64 as mut [u8; 8] as *mut std::ffi::c_void,
	8,
	msr,
	)
	};
	assert_eq!(res, 8);

	return v;
	}

	#[cfg(not(any(target_os = "illumos", target_os = "linux")))]
	panic!("unsupported target OS");
	}
	}

	/// Time stamp counter, MSR `0x10`.
	const TSC: MsrDesc = MsrDesc {
	    name: "TSC",
	    msr_nr: 0x0000_0010,
	};

	/// AMD RAPL per-core counter, MSR `0xC001_029A`.
	const CORE_POWER: MsrDesc = MsrDesc {
	    name: "RAPL Core Power",
	    msr_nr: 0xC001_029A,
	};

	/// AMD RAPL per-package counter, MSR `0xC001_029B`.
	const PACKAGE_POWER: MsrDesc = MsrDesc {
	    name: "RAPL Package Power",
	    msr_nr: 0xC001_029B,
	};

	/// Reads `msr` back-to-back `samples` times and returns the values, in read order, together with
	/// the elapsed wall time.
	///
	/// The output buffer is allocated before the clock starts, so the returned duration covers only
	/// the read loop.
	fn sample_msr(msr_ctx: &MsrCtx, msr: MsrDesc, samples: usize) -> (Vec<u64>, Duration) {
	    let mut readings = Vec::with_capacity(samples);

	    let timer = std::time::Instant::now();
	    readings.extend((0..samples).map(|_| msr_ctx.rdmsr(msr.msr_nr)));
	    let elapsed = timer.elapsed();

	    (readings, elapsed)
	}

	/// Baseline for `sample_msr`: runs the same timed fill loop but stores the MSR number itself
	/// instead of reading the register, so no syscall is made.
	///
	/// Returns the filled buffer (`samples` copies of `msr.msr_nr`) and the elapsed wall time.
	fn no_op(_msr_ctx: &MsrCtx, msr: MsrDesc, samples: usize) -> (Vec<u64>, Duration) {
	    let mut filler = Vec::with_capacity(samples);

	    let timer = std::time::Instant::now();
	    filler.extend((0..samples).map(|_| msr.msr_nr));
	    let elapsed = timer.elapsed();

	    (filler, elapsed)
	}

	// in case you want to write out all the samples from an MSR...
	/// Renders every sample in decimal, one per line, each line newline-terminated.
	// not called by default; kept around as a debugging aid.
	#[allow(dead_code)]
	fn stringify(samples: Vec<u64>) -> String {
	    samples.iter().fold(String::new(), |mut text, value| {
	        writeln!(text, "{}", value).expect("works");
	        text
	    })
	}

	/// Prints a summary of one MSR sampling run to stdout.
	///
	/// Reports the read rate, how far the value moved between the first and last sample, how many
	/// consecutive reads returned the same value before it changed, and the average size of each
	/// increase.
	///
	/// * `msr` - the register that was sampled; used for the header line only.
	/// * `samples` - the values read, in read order.
	/// * `duration` - wall time taken to collect `samples`.
	///
	/// An empty `samples` prints the header and a short note. If the counter went backwards during
	/// the run (e.g. it wrapped), a warning is written to stderr rather than panicking.
	fn show_stats(msr: MsrDesc, samples: Vec<u64>, duration: Duration) {
	    println!("/!\\ MSR {:#08x}, {} /!\\", msr.msr_nr, msr.name);

	    let (first, last) = match (samples.first(), samples.last()) {
	        (Some(&first), Some(&last)) => (first, last),
	        _ => {
	            println!("read 0 samples in {:?}; nothing to summarize", duration);
	            return;
	        }
	    };

	    let sample_count = samples.len();
	    // divide in nanoseconds so the divisor is never truncated to 32 bits. the quotient is at most
	    // the run's own nanosecond count, so narrowing it back to u64 is lossless for any real run.
	    let time_per_sample =
	        Duration::from_nanos((duration.as_nanos() / sample_count as u128) as u64);
	    println!(
	        "read {} samples in {:?} ({:?} each)",
	        sample_count, duration, time_per_sample
	    );

	    // wrapping: if the counter rolled over mid-run, `last < first`. a plain subtraction would
	    // panic in debug builds before the wraparound warning below gets a chance to print.
	    let first_last_diff = last.wrapping_sub(first);
	    // fractional milliseconds in f64: whole-ms truncation skews short runs (and divides by zero
	    // below 1ms), and f32 cannot represent large diffs exactly.
	    let diff_per_ms = first_last_diff as f64 / (duration.as_secs_f64() * 1000.0);
	    println!(
	        "changed by {:.2} per ms (total diff: {})",
	        diff_per_ms, first_last_diff
	    );
	    let min = samples.iter().copied().min().unwrap_or(first);
	    let max = samples.iter().copied().max().unwrap_or(last);
	    if max - min != first_last_diff {
	        eprintln!("! first was not min or last was not max (wraparound while running?)");
	    }

	    // consecutive reads may return the same value. some MSRs note that reads must be spaced more
	    // than 1ms apart to compute meaningful deltas. validate that here.
	    //
	    // a "tick" is a run of identical consecutive samples; only increases contribute to the
	    // average change, so a backwards step (wraparound) does not poison it.
	    let mut ticks: u64 = 1;
	    let mut increase_total = 0.0f64;
	    let mut increase_count: u64 = 0;
	    for pair in samples.windows(2) {
	        let (prev, cur) = (pair[0], pair[1]);
	        if cur == prev {
	            continue;
	        }
	        ticks += 1;
	        if cur > prev {
	            increase_total += (cur - prev) as f64;
	            increase_count += 1;
	        }
	    }

	    // run lengths sum to the sample count, so this is the mean run length.
	    let avg_reads_per_tick = sample_count as f64 / ticks as f64;
	    // computed from the full duration so neither factor is rounded to an integer first.
	    let ns_per_change = duration.as_secs_f64() * 1e9 / ticks as f64;
	    println!(
	        "averaged {:0.2} consecutive {} per change, for {:.0}ns per change",
	        avg_reads_per_tick,
	        if avg_reads_per_tick == 1.0 {
	            "read"
	        } else {
	            "reads"
	        },
	        ns_per_change
	    );
	    if increase_count == 0 {
	        println!("average change per tick: n/a (value never increased)");
	    } else {
	        println!(
	            "average change per tick: {:0.2}",
	            increase_total / increase_count as f64
	        );
	    }
	}

	/// Entry point: times a no-op baseline loop, then samples and reports on the TSC, core power and
	/// package power MSRs, in that order.
	pub fn main() {
	    let msr_ctx = MsrCtx::new();

	    // baseline first: how long does filling the buffer take without touching an MSR?
	    let (baseline, baseline_elapsed) = no_op(&msr_ctx, TSC, SAMPLE_COUNT);
	    let baseline_len = baseline.len();
	    println!(
	        "read {} samples in {:?} ({:?} each)",
	        baseline_len,
	        baseline_elapsed,
	        baseline_elapsed / baseline_len as u32
	    );

	    for msr in [TSC, CORE_POWER, PACKAGE_POWER] {
	        let (samples, duration) = sample_msr(&msr_ctx, msr, SAMPLE_COUNT);
	        show_stats(msr, samples, duration);
	    }
	}