Skip to content

Instantly share code, notes, and snippets.

@wolfspider
Last active July 30, 2018 05:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wolfspider/33174be531b51287fbfb3b084baf766c to your computer and use it in GitHub Desktop.
Save wolfspider/33174be531b51287fbfb3b084baf766c to your computer and use it in GitHub Desktop.
Proc Mapping to Native Function Notes for FoundationDB FreeBSD

Notes for mapping against proc in linprocfs to native functions in FreeBSD

Important Sections Duplicated From Linux

file: /foundationdb/flow/platform.cpp

line 262: GetResidentMemoryUsage

Gets the current rssize for the current process

line 318: GetMemoryUsage

Gets the current vmsize for the current process

  • /proc/self/statm : line 266 & line 322

Example FreeBSD Code

#include <iostream>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/sysctl.h>
#include <sys/param.h>
#include <libutil.h>
#include <unistd.h>

using namespace std;

/*
 * Prints the command name, resident set size and virtual size of the
 * calling process, as reported by the kern.proc.pid.<pid> sysctl.
 *
 * Returns 0 on success, 1 when the sysctl query fails.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // MIB for kern.proc.pid.<our pid>
    int pidinfo[4];
    pidinfo[0] = CTL_KERN;
    pidinfo[1] = KERN_PROC;
    pidinfo[2] = KERN_PROC_PID;
    pidinfo[3] = (int)getpid();

    struct kinfo_proc procstk;
    size_t len = sizeof(procstk);

    // Without this check a failed query would print uninitialized stack data.
    if (sysctl(pidinfo, nitems(pidinfo), &procstk, &len, NULL, 0) == -1) {
        perror("sysctl(kern.proc.pid)");
        return 1;
    }

    printf("name: %s\n", procstk.ki_comm);
    // resident size
    // NOTE(review): sys/user.h presumably reports ki_rssize in pages - confirm.
    printf("rssize: %ju\n", (uintmax_t)procstk.ki_rssize);
    // virtual size, scaled down by the page shift
    printf("size: %ju\n", (uintmax_t)procstk.ki_size >> PAGE_SHIFT);

    return 0;
}

line 399: getMachineRAMInfo()

  • /proc/zoneinfo : line 412

  • /proc/meminfo : line 423

Gets the current RAM totals for the machine

Example FreeBSD Code

int status;
u_int page_size;
u_int free_count;
u_int active_count;
u_int inactive_count;
u_int wire_count;

size_t uint_size;
uint_size = sizeof(page_size);

status = sysctlbyname("vm.stats.vm.v_page_size", &page_size, &uint_size, NULL, 0);
if (status < 0){
TraceEvent(SevError, "GetMachineMemInfo").GetLastError();
throw platform_error();
}

status = sysctlbyname("vm.stats.vm.v_free_count", &free_count, &uint_size, NULL, 0);
if (status < 0){
TraceEvent(SevError, "GetMachineMemInfo").GetLastError();
throw platform_error();
}

status = sysctlbyname("vm.stats.vm.v_active_count", &active_count, &uint_size, NULL, 0);
if (status < 0){
TraceEvent(SevError, "GetMachineMemInfo").GetLastError();
throw platform_error();
}

status = sysctlbyname("vm.stats.vm.v_inactive_count", &inactive_count, &uint_size, NULL, 0);
if (status < 0){
TraceEvent(SevError, "GetMachineMemInfo").GetLastError();
throw platform_error();
}

status = sysctlbyname("vm.stats.vm.v_wire_count", &wire_count, &uint_size, NULL, 0);
if (status < 0){
TraceEvent(SevError, "GetMachineMemInfo").GetLastError();
throw platform_error();
}

memInfo.total = (int64_t)((free_count + active_count + inactive_count + wire_count) * (u_int64_t)(page_size / 1024));
memInfo.available = (int64_t)(free_count * (u_int64_t)(page_size / 1024));
memInfo.committed = memInfo.total - memInfo.available;

line 633: GetNetworkTraffic

  • /proc/net/dev : line 649

We have an example for /proc/net/dev here: http://www.leidinger.net/freebsd/dox/linux/html/d0/d56/linprocfs_8c.html#a7daba0127930a3bfc748a4bb6db64671

Decided to use netstat for inspiration at /freebsd/usr.bin/netstat/if.c line 519: "fill_iftot(struct iftot *st)"

In this example the parse IP function is there to recreate the "unixish" platform def for this build that is shared between OS's. It will of course not be going into the next commit :) just an example.

Example FreeBSD Code

//these are additional includes...like a diff
#include <stdexcept>

#include <netinet/in.h>

/* getifaddrs */
#include <ifaddrs.h>
#include <arpa/inet.h>
#include <net/if.h>

const char* getInterfaceName(uint32_t _ip) {

static char iname[20];

struct ifaddrs* interfaces = NULL;
const char* ifa_name = NULL;

if (getifaddrs(&interfaces)) {
	throw;
}

for (struct ifaddrs* iter = interfaces; iter; iter = iter->ifa_next) {
	if(!iter->ifa_addr)
		continue;
	if (iter->ifa_addr->sa_family == AF_INET) {
		uint32_t ip = ntohl(((struct sockaddr_in*)iter->ifa_addr)->sin_addr.s_addr);
		if (ip == _ip) {
			ifa_name = iter->ifa_name;
			break;
		}
	}
}

if (ifa_name) {
	strncpy(iname, ifa_name, 19);
	iname[19] = 0;
}

freeifaddrs(interfaces);

if (ifa_name)
	return iname;
else
	return NULL;
}

/*
 * Parses a dotted-quad IPv4 string ("a.b.c.d") into a 32-bit value with the
 * first octet in the most significant byte.
 *
 * ipAddress  NUL-terminated text; may be NULL.
 *
 * Returns the packed address, or 0 when the input is NULL, does not contain
 * four numeric fields, or has any field above 255. Note that 0 is therefore
 * also the result for "0.0.0.0".
 */
uint32_t parseIPV4string(char const * ipAddress)
{
    unsigned int ip[4];

    if (ipAddress == NULL)
        return 0;

    if ( 4 != sscanf(ipAddress, "%u.%u.%u.%u", &ip[0], &ip[1], &ip[2], &ip[3]) )
        return 0;   // or some other indicator or error

    // Out-of-range octets used to spill into the neighbouring byte (or be
    // truncated away), silently producing a different address.
    for (int i = 0; i < 4; i++) {
        if (ip[i] > 255)
            return 0;
    }

    return ((uint32_t)ip[0] << 24) | ((uint32_t)ip[1] << 16) |
           ((uint32_t)ip[2] << 8)  |  (uint32_t)ip[3];
}
//hardwired to emulate function
uint32_t ip = parseIPV4string("192.168.64.2");

uint64_t	ift_ip;			/* input packets */
uint64_t	ift_ie;			/* input errors */
uint64_t	ift_id;			/* input drops */
uint64_t	ift_op;			/* output packets */
uint64_t	ift_oe;			/* output errors */
uint64_t	ift_od;			/* output drops */
uint64_t	ift_co;			/* collisions */
uint64_t	ift_ib;			/* input bytes */
uint64_t	ift_ob;			/* output bytes */

const char* ifa_getname = nullptr;
try {
	ifa_getname = getInterfaceName(ip);
}
catch(exception &e) {
	
	throw;
}

printf("interface: %s\n", ifa_getname);

struct ifnet *ifp;
struct ifaddrs* interfaces = NULL;
struct iftot *st;

if (getifaddrs(&interfaces)) {
	throw;
}

for (struct ifaddrs* iter = interfaces; iter; iter = iter->ifa_next) {
	if(!iter->ifa_addr)
		continue;
if (strcmp(iter->ifa_name, ifa_getname) == 0 && iter->ifa_addr->sa_family == AF_LINK) {

	printf("we got info...\n");   

	ift_ip += ((struct if_data *)iter->ifa_data)->ifi_ipackets;     
	ift_ie += ((struct if_data *)iter->ifa_data)->ifi_ierrors;
	ift_id += ((struct if_data *)iter->ifa_data)->ifi_iqdrops;
	ift_ib += ((struct if_data *)iter->ifa_data)->ifi_ibytes;
	ift_op += ((struct if_data *)iter->ifa_data)->ifi_opackets;
	ift_oe += ((struct if_data *)iter->ifa_data)->ifi_oerrors;
	ift_od += ((struct if_data *)iter->ifa_data)->ifi_oqdrops;
	ift_ob += ((struct if_data *)iter->ifa_data)->ifi_obytes;
	ift_co += ((struct if_data *)iter->ifa_data)->ifi_collisions;       

	}
}
freeifaddrs(interfaces);

/* packets received on interface */

printf("packets received: %ju\n", ift_ip);

OK, this matches what netstat reports. Let's do the same thing, except allocate this the way ifstat does instead!

Example FreeBSD Code

int if_count;
int i;
int mib[6];
size_t ifmiblen;
struct ifmibdata ifmd;

mib[0] = CTL_NET;
mib[1] = PF_LINK;
mib[2] = NETLINK_GENERIC;
mib[3] = IFMIB_IFDATA;
mib[4] = IFMIB_IFCOUNT;
mib[5] = IFDATA_GENERAL;

ifmiblen = sizeof(ifmd);

for (i = 1; i <= if_count; i++) {
	mib[4] = i; //row
	
	sysctl(mib, 6, &ifmd, &ifmiblen, (void *)0, 0);	
	
	if (!strcmp(ifmd.ifmd_name, ifa_getname)) {
		printf("bytes sent: %ju\n", ifmd.ifmd_data.ifi_obytes);
		printf("bytes received: %ju\n", ifmd.ifmd_data.ifi_ibytes);
		printf("total packets sent: %ju\n", ifmd.ifmd_data.ifi_opackets);
	}
}
  • /proc/net/snmp : line 683

Gets the current network traffic statistics for active open connections, reset connections, etc.. https://unix.stackexchange.com/questions/435579/is-there-documentation-for-proc-net-netstat-and-proc-net-snmp

For this we need total segments out and retransmitted segments and the equivalent for Darwin (in FreeBSD) total packets out is:

ift_op += ((struct if_data *)iter->ifa_data)->ifi_opackets;

Linux seems to analyze the entire TCP which is not adapter specific:

// Quoted Linux implementation: pulls two TCP counters out of /proc/net/snmp.
std::ifstream snmp_stream("/proc/net/snmp", std::ifstream::in);

std::string label;

// Read the leading token of each line and discard the rest of that line.
// The loop stops after consuming the first line labelled "Tcp:", which
// leaves the stream positioned at the start of the following line.
while (snmp_stream.good()) {
	snmp_stream >> label;
	snmp_stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
	if (label == "Tcp:")
		break;
}

/* Ignore the first 11 columns of the Tcp line */
for (int i = 0; i < 11; i++)
	snmp_stream >> ignore;

// The next two tokens are stored as the outgoing and retransmitted
// segment counts (outSegs / retransSegs are declared outside this excerpt).
snmp_stream >> outSegs;
snmp_stream >> retransSegs;

For retransmitted segments netstat does a sysctlbyname to: "net.inet.tcp.stats" in order to retrieve the value of: tcps_sndrexmitpack found @ line 525: /freebsd/usr.bin/netstat/inet.c

Maybe gather total out segments via "net.inet.tcp.stats" for FreeBSD then?

Example FreeBSD Code

struct tcpstat tcpstat;
size_t stat_len;
stat_len = sizeof(tcpstat);
int tcpstatus = sysctlbyname("net.inet.tcp.stats", &tcpstat, &stat_len, NULL, 0);

printf("total packets sent: %ju\n", tcpstat.tcps_sndtotal);
printf("total packets retransmitted: %ju\n", tcpstat.tcps_sndrexmitpack); 

So, we can get the total packets sent per interface, but this raises an interesting question: which one should we use? Based on comments from the official FoundationDB forum: https://forums.foundationdb.org/t/freebsd-support-for-foundationdb/357/11 we are going to go with tcpstat. There is a lot of good stuff in there for customizations later on.

line 702: GetMachineLoad

  • /proc/stat: line 704

(in the comments: // Even though this function doesn't throw errors, the equivalents for other platforms do, and since all of our simulation testing is on Linux... )

stats for I/O, idle, wait, interrupt requests..similar to top or activity monitor

// Quoted Linux implementation: derives total and idle CPU time from the
// first line of /proc/stat.
std::ifstream stat_stream("/proc/stat", std::ifstream::in);

// Discard the leading token of the line before the numeric fields.
std::string ignore;
stat_stream >> ignore;

// Nine counters are read in order from the same line.
uint64_t t_user, t_nice, t_system, t_idle, t_iowait, t_irq, t_softirq, t_steal, t_guest;
stat_stream >> t_user >> t_nice >> t_system >> t_idle >> t_iowait >> t_irq >> t_softirq >> t_steal >> t_guest;

// Total is the sum of all nine fields; I/O wait is counted as idle time.
totalTime = t_user+t_nice+t_system+t_idle+t_iowait+t_irq+t_softirq+t_steal+t_guest;
idleTime = t_idle+t_iowait;

On FreeBSD, systat -vmstat is where it's at.

Example FreeBSD Code

// Samples the kern.cp_time counters once per second for 60 iterations and
// prints the share of each interval spent in the user, nice, sys and idle
// states. Because `last` starts zeroed, the first iteration reports the
// share over the raw counter values rather than over a one-second interval.
long cur[CPUSTATES], last[CPUSTATES];
size_t cur_sz;
int state, j;
long sum;
double util;

memset(last, 0, sizeof last);

for (j = 0; j < 60; j++)
{
	// sysctlbyname() writes the returned length back; reset it per call.
	cur_sz = sizeof cur;

	if (sysctlbyname("kern.cp_time", &cur, &cur_sz, NULL, 0) < 0)
	{
		// The message now names the sysctl that is actually queried.
		printf("Error reading kern.cp_time sysctl\n");
		return -1;
	}

	// Turn the cumulative counters into per-interval deltas, remembering the
	// raw values for the next round.
	sum = 0;
	for (state = 0; state < CPUSTATES; state++)
	{
		long raw = cur[state];
		cur[state] -= last[state];
		last[state] = raw;
		sum += cur[state];
	}

	// Guard against a zero-length interval.
	const double denom = sum ? (double)sum : 1.0;

	util = 100.0 * cur[CP_USER] / denom;
	printf("cpu user utilization: %7.3f\n", util);
	util = 100.0 * cur[CP_NICE] / denom;
	printf("cpu nice utilization: %7.3f\n", util);
	util = 100.0 * cur[CP_SYS] / denom;
	printf("cpu sys utilization: %7.3f\n", util);
	util = 100.0 * cur[CP_IDLE] / denom;
	printf("cpu idle utilization: %7.3f\n", util);
	sleep(1);
}

This took much longer than it should have, and is mostly an answer to a Stack Overflow question with the formula changed. https://stackoverflow.com/questions/5329149/using-system-calls-from-c-how-do-i-get-the-utilization-of-the-cpus

ok, we'll get this looking better but has all the info that Darwin has at the moment- time to move on.

line 719: GetDiskStatistics

  • /proc/diskstats : line 729

This one seems a little more complex but according to iostat.c these statistics can be retrieved since boot time via devstat_compute_statistics(). An example can be found on line 836 of iostat.c and documentation for devstat can be found here: https://www.freebsd.org/cgi/man.cgi?query=devstat&apropos=0&sektion=3&manpath=FreeBSD+11.0-RELEASE&arch=default&format=html

This one promises userland access, which has been problematic at times. It's easy to accidentally reach for KERNEL-defined functions, but the separation makes this more secure. HardenedBSD audits this more closely as well, making sure to be extremely careful about CPU access and the like.

(in the comments:

		uint64_t rd_ios;	/* # of reads completed */
		//	    This is the total number of reads completed successfully.
		uint64_t rd_merges;	/* # of reads merged */
		//	    Reads and writes which are adjacent to each other may be merged for
		//	    efficiency.  Thus two 4K reads may become one 8K read before it is
		//	    ultimately handed to the disk, and so it will be counted (and queued)
		//	    as only one I/O.  This field lets you know how often this was done.

		uint64_t rd_sectors; /*# of sectors read */
		//	    This is the total number of sectors read successfully.

		uint64_t rd_ticks;	/* # of milliseconds spent reading */
		//	    This is the total number of milliseconds spent by all reads (as
		//	    measured from __make_request() to end_that_request_last()).

		uint64_t wr_ios;	/* # of writes completed */
		//	    This is the total number of writes completed successfully.

		uint64_t wr_merges;	/* # of writes merged */
		//	    Reads and writes which are adjacent to each other may be merged for
		//	    efficiency.  Thus two 4K reads may become one 8K read before it is
		//	    ultimately handed to the disk, and so it will be counted (and queued)
		//	    as only one I/O.  This field lets you know how often this was done.

		uint64_t wr_sectors; /* # of sectors written */
		//	    This is the total number of sectors written successfully.

		uint64_t wr_ticks;	/* # of milliseconds spent writing */
		//	    This is the total number of milliseconds spent by all writes (as
		//	    measured from __make_request() to end_that_request_last()).

		uint64_t cur_ios;	/* # of I/Os currently in progress */
		//	    The only field that should go to zero. Incremented as requests are
		//	    given to appropriate struct request_queue and decremented as they finish.

		uint64_t ticks;	/* # of milliseconds spent doing I/Os */
		//	    This field increases so long as field 9 is nonzero.

		uint64_t aveq;	/* weighted # of milliseconds spent doing I/Os */
		//	    This field is incremented at each I/O start, I/O completion, I/O
		//	    merge, or read of these stats by the number of I/Os in progress
		//	    (field 9) times the number of milliseconds spent doing I/O since the
		//	    last update of this field.  This can provide an easy measure of both
		//	    I/O completion time and the backlog that may be accumulating.

)

Example FreeBSD Code

static struct statinfo dscur;
double etime;
struct timespec ts;
static int num_devices;

kvm_t *kd = NULL;

etime = ts.tv_nsec * 1e-6;

int dn;
u_int64_t total_transfers_read, total_transfers_write;
u_int64_t total_blocks_read, total_blocks_write;
u_int64_t queue_len;
long double ms_per_transaction;

dscur.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo));
if (dscur.dinfo == NULL)
	printf("calloc failed");

if (devstat_getdevs(kd, &dscur) == -1)
	printf("%s", devstat_errbuf);

num_devices = dscur.dinfo->numdevs;

for (dn = 0; dn < num_devices; dn++)
{
	int di;

	if (devstat_compute_statistics(&dscur.dinfo->devices[dn], NULL, etime,
								   DSM_MS_PER_TRANSACTION, &ms_per_transaction,
								   DSM_TOTAL_TRANSFERS_READ, &total_transfers_read,
								   DSM_TOTAL_TRANSFERS_WRITE, &total_transfers_write,
								   DSM_TOTAL_BLOCKS_READ, &total_blocks_read,
								   DSM_TOTAL_BLOCKS_WRITE, &total_blocks_write,
								   DSM_QUEUE_LENGTH, &queue_len,
								   DSM_NONE) != 0)
		printf("%s", devstat_errbuf);

	printf("ios: %ju\nbusy: %i\n", queue_len, (int)ms_per_transaction);

	printf("reads:%11.1Lf\nwrites:%11.1Lf\nread sectors:%11.1Lf\nwrite sectors:%11.1Lf\n",
		   (long double)total_transfers_read,
		   (long double)total_transfers_write,
		   (long double)total_blocks_read,
		   (long double)total_blocks_write);
}

Alright time to do some math! This was difficult to figure out but here is my reasoning...

Tick is described as a millisecond and we have structs for seconds and nanosecond. So we poll the devices with ts.tv_nsec * 1e-6. Nanosecond to Millisecond conversion is 1.0 * 1e-6 accordingly: http://extraconversion.com/time/nanoseconds/nanoseconds-to-milliseconds.html

Transfers are designated with the same verbiage; FreeBSD defines sectors as "blocks". This holds true for TCP segments as well: those are similarly called "blocks".

Current IOs are defined throughout the rest of the platform as:

	returnStats.processDiskQueueDepth = currentIOs;

Ok, so this must be the Queue Length value defined in FreeBSD's devstat. What about busyTicks?

	busyTicks = ticks;

So from what was defined earlier this value is: "# of milliseconds spent doing I/Os"

Since we are polling this per millisecond, we use "ms_per_transaction" in order to create a discrete ratio of: ms_per_transaction / 1 millisecond = # of milliseconds spent doing I/Os

Update From Review

looked for something simpler than this and found atop on FreeBSD uses ms_per_transaction / 1000 to calculate io/ms going to try this instead:

i->dsk.dsk[dn].io_ms = ms_per_transaction * 1000;

atop source

Looking to functional code from the scientific community let's use the time calculations from Criterion for Haskell

The Linux and FreeBSD timers are also defined this way:

// Reads CLOCK_REALTIME and returns it as fractional seconds: whole seconds
// plus the nanosecond remainder scaled by 1e-9.
struct timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return double(ts.tv_sec) + (ts.tv_nsec * 1e-9);

etime theoretically should be then for CPUTime:

ts.tv_sec + ts.tv_nsec * 1e-9

Reading devstat more thoroughly we find that alloc to specific pointers are a delta between what is known as 'previous' and 'current':

The devstat_compute_statistics() function provide complete statistics calculation. There are four arguments for which values must be supplied: current, previous, etime, and the terminating argument for the varargs list, DSM_NONE. For most applications, the user will want to supply valid devstat structures for both current and previous. In some instances, for instance when calculating statistics since system boot, the user may pass in a NULL pointer for the previous argument. In that case, devstat_compute_statistics() will use the total stats in the current structure to calculate statistics over etime. For each statistics to be calculated, the user should supply the proper enumerated type (listed below), and a variable of the indicated type. All statistics are either integer values, for which a uint64_t is used, or floating point, for which a long double is used.

What this suggests is that we must allocate two sets of structs, poll the devices, and set the polling time to CPUTime. Comments in atop refer to the necessity of polling twice in order to compute the difference between 'previous' and 'current' for this to work.

  • Line 1875 & 1880: setaffinity

Leaving this section the same we are talking about routines which are 4 lines each. Attempted something tricky but was getting too close to the sun. Both routines only have two lines in common and apparently the compiler does not like slicing it up with conditional statements there. I think ~2-4 lines of code is the threshold for safe de-duplication.

Guess that's it. The deep dive into the kernel was enlightening now onto merging this in!

@wolfspider
Copy link
Author

wolfspider commented May 14, 2018

Quick screen for how to verify the code example for /proc/net/dev with netstat:

netverify

@wolfspider
Copy link
Author

@wolfspider
Copy link
Author

example of atop and metric "avio" calculates io/ms

atopscreen

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment