/******************************************************************
 * CS61C                       CACHE LAB                          *
 * Using this program, on as many different kinds of computers as *
 * possible, investigate these cache parameters:                  *
 *   -- total cache size                                          *
 *   -- cache width                                               *
 *   -- cache replacement policy                                  *
 *                                                                *
 * Make sure you compile this code without optimizations on       *
 * otherwise, the overhead calculations will not be correct       *
 *                                                                *
 * ChangeLog:                                                     *
 *   2004.03.20:                                                  *
 *   Heavily rewritten by Jeremy Huddleston <jeremyhu@cory>       *
 *     Changed CACHE* names to ARRAY* names                       *
 *     Using (1 << X) notation for sizes of 2^X                   *            
 *     #ifndef around CLK_TCK since it's set by POSIX headers     *
 *     broke up main() into sub functions                         *
 *     do everything except output using clock ticks              *
 ******************************************************************/

#include <stdio.h>
#include <sys/times.h>
#include <sys/types.h>
#include <time.h>

/* You can undefine this if you want to check the r or w speeds, but it needs some
 * tweaking to fix the averaging...
 */
#define RW_ONLY

/* Note that (1 << X) is 2^X ... the MIN/MAX macros below MUST be powers of 2. */
#define ARRAY_MIN (1 << 10)  /* smallest cache (in words) */
#define ARRAY_MAX (1 << 20)  /* largest cache */
#define STRIDE_MIN (1 << 0)  /* smallest stride (in words) */
#define STRIDE_MAX (1 << 10) /* largest stride */
#define SAMPLE 10000000      /* Approximately how many accesses to the array do we want
                              * to make in our analysis.  Larger generates better
                              * statistics.  Adjust for speed on your system.  This
                              * number is by no means exact.  We always do at-least this
                              */

#ifndef CLK_TCK              /* POSIX systems already provide this */
#define CLK_TCK 100          /* number clock ticks per micro-second (ie. the MHz of the core)*/
#endif

/* This macro creates a function which iterates through the passed array of size asize striding
 * by stride.  We return the number of clock ticks it takes to perform the operation.
 */

#define ARRAY_ACCESS_FUN(func_name, vars, operation) \
static unsigned func_name(unsigned array[], unsigned asize, unsigned stride, unsigned steps) { \
	vars; \
	unsigned register i, j; \
	unsigned utime; \
	struct tms rusage; \
\
	/* Initialize the time count */ \
	times (&rusage); \
	utime = rusage.tms_utime; \
\
	/* Iterate through our array */ \
	for(j=0; j < steps; j++) { \
		for(i=0; i < asize; i += stride) { \
			operation; \
		} \
	} \
\
	/* Calculate the final time */ \
	times (&rusage);\
	return (rusage.tms_utime - utime); \
}

/* Make some functions using that macro */
ARRAY_ACCESS_FUN(rArray, unsigned register temp, temp = array[i])
ARRAY_ACCESS_FUN(rArrayOverhead, unsigned register temp, temp = i)

ARRAY_ACCESS_FUN(wArray, , array[i] = i)
ARRAY_ACCESS_FUN(wArrayOverhead, , )

ARRAY_ACCESS_FUN(rwArray, , array[i] = array[i])
ARRAY_ACCESS_FUN(rwArrayOverhead, , )

/* array going to stride through */
unsigned array[ARRAY_MAX];

/* And... begin... */
int main () {
	int asize, stride, step, steps, accesses;
	unsigned rtime, wtime, rwtime;
	double rsec, wsec, rwsec;

	printf("CLK_TCK: %d\n", CLK_TCK);

	for (asize = ARRAY_MIN; asize <= ARRAY_MAX; asize = asize << 1) {
		/* Print an extra \n here to separate the groups with the same array size */
		printf("\n");

		for (stride = STRIDE_MIN; stride <= STRIDE_MAX && stride <= asize; stride = stride << 1) {
			/* How many total accesses do we do on each iteration */
			accesses = asize / stride;
			steps = 1 + ( SAMPLE / accesses );
#ifndef RW_ONLY
			/**************** FIRST READ ****************/

			rtime = 0;
			/* Do it once to initialize the cache before we start timing */
			rArray(array, asize, stride, 1);

			/* We want to average the overhead too... */
			rtime += rArray(array, asize, stride, steps) - rArrayOverhead(array, asize, stride, steps);

			/* Calculate our time */
			rsec = (double) rtime * 1e9 / ((double)CLK_TCK * steps * accesses);

			/**************** NOW WRITE ****************/

			wtime = 0;
			/* Do it once to initialize the cache before we start timing */
			wArray(array, asize, stride, 1);

			/* We want to average the overhead too... */
			wtime += wArray(array, asize, stride, steps) - wArrayOverhead(array, asize, stride, steps);

			/* Calculate our time */
			wsec = (double) rwtime * 1e9 / ((double)CLK_TCK * steps * accesses);

#endif
			/**************** NOW READ/WRITE ****************/

			rwtime = 0;
			/* Do it once to initialize the cache before we start timing */
			rwArray(array, asize, stride, 1);

			/* We want to average the overhead too... */
			rwtime += rwArray(array, asize, stride, steps) - rwArrayOverhead(array, asize, stride, steps);

			/* Calculate our time */
			rwsec = (double) rwtime * 1e9 / ((double)CLK_TCK * steps * accesses);

			/**************** PRINT RESULTS ****************/

#ifdef RW_ONLY
			printf("Size (bytes): %7d Stride (bytes): %4d read+write: %4.0f ns\n",
			       asize * sizeof (int),
			       stride * sizeof (int),
			       rwsec);
#else
			printf("Size (bytes): %7d Stride (bytes): %4d read: %4.0f ns write: %4.0f ns read+write: %4.0f ns\n",
			       asize * sizeof (int),
			       stride * sizeof (int),
			       rsec,
			       wsec,
			       rwsec);
#endif
		}
	}
}
