/*---------------------------------------------------------------------
 *        [ Copyright (c) 1999 Alpha Processor Inc.] - Unpublished Work
 *          All rights reserved
 * 
 *    This file contains source code written by Alpha Processor, Inc.
 *    It may not be used without express written permission. The
 *    expression of the information contained herein is protected under
 *    federal copyright laws as an unpublished work and all copying
 *    without permission is prohibited and may be subject to criminal
 *    and civil penalties. Alpha Processor, Inc.  assumes no
 *    responsibility for errors, omissions, or damages caused by the use
 *    of these programs or from use of the information contained herein.
 *  
 *-------------------------------------------------------------------*/
/* SMP-aware Memory test wrapper functions                            */
/* (c) 1999 Alpha Processor Inc.                                      */
/* Begun by Stig Telfer, API, 4 August 1999                           */

#include "lib.h"
#include "uilib.h"
#include "smp.h"

#include "mem.h"

#define QWDS_PER_CACHELINE 8		/* 8 64-bit quadwords per cache line */

/*--------------------------------------------------------------------*/
/* Test driver function prototypes
 *
 * The SMP behaviour of these memory tests regarding division of labour
 * does vary.  See the individual routines for detailed information
 */

static size_t uniq_wr( SMP_ptr start, size_t bytes );
static size_t uniq_rd( SMP_ptr start, size_t bytes );

static size_t energy_wr( SMP_ptr start, size_t bytes );
static size_t energy_rd( SMP_ptr start, size_t bytes );

static size_t rnddata_wr( SMP_ptr start, size_t bytes );
static size_t rnddata_rd( SMP_ptr start, size_t bytes );

static size_t dpattern_wr( SMP_ptr start, size_t bytes );
static size_t dpattern_rd( SMP_ptr start, size_t bytes );

static size_t icache_wr( SMP_ptr start, size_t bytes );
static size_t icache_rd( SMP_ptr start, size_t bytes );

static size_t gc_wr( SMP_ptr start, size_t bytes );
static size_t gc_rd( SMP_ptr start, size_t bytes );

static size_t strided_wr( SMP_ptr start, size_t bytes );
static size_t strided_rd( SMP_ptr start, size_t bytes );

static size_t cachexch_wr( SMP_ptr start, size_t bytes );
static size_t cachexch_rd( SMP_ptr start, size_t bytes );


/* Used by several of the test functions */
extern size_t dpattern_asm( SMP_ptr start, size_t bytes, SMP_ptr buf );


/* Table of memory tests.  Each entry gives a display name followed by the
 * write-phase driver and the read-back driver for that test (the static
 * *_wr / *_rd functions defined later in this file). */
const tst_t tst[] = {
	{ "Cacheline Exchange",	cachexch_wr,	cachexch_rd },
	{ "Strided access",	strided_wr,	strided_rd },
        { "Istream/Icache",	icache_wr,      icache_rd },
	{ "Gray code data",	gc_wr, 		gc_rd },
        { "Address uniqueness",	uniq_wr,	uniq_rd    },
        { "Bus transitions",	energy_wr,	energy_rd  },
        { "Random patterns",	dpattern_wr,    dpattern_rd },

	/* The entries below are currently unused: they sit beyond the first
	 * `ntests` slots of the table. */
        { "Random data",        rnddata_wr,     rnddata_rd },
};

/* The table holds 8 entries but ntests is fixed at 7, so the final
 * "Random data" entry is not counted.  The sizeof-based form below would
 * count all 8 and is left disabled.
 * NOTE(review): presumably deliberate (rnddata_* is flagged as not SMP-safe
 * further down) - keep this constant in step with the table by hand. */
/* const unsigned ntests = sizeof( tst ) / sizeof( tst_t ); */
const unsigned ntests = 7;


/*--------------------------------------------------------------------*/
/* Address Uniqueness Test - sets memory contents to address of mem */

extern size_t uniq_wr_asm( SMP_ptr start, size_t bytes );

/* Write phase of the address-uniqueness test.
 *
 * The region [start, start+bytes) is divided into smp_ncpus() equal
 * slices; this CPU takes the slice indexed by smp_myid() and passes it to
 * uniq_wr_asm().  Returns the value uniq_wr_asm() reports for that slice.
 */
static size_t uniq_wr( SMP_ptr start, size_t bytes )
{
    size_t share = bytes / smp_ncpus();			/* per-CPU slice size */
    char *slice = (char *)start + share * smp_myid();	/* this CPU's slice */

    return uniq_wr_asm( (SMP_ptr)slice, share );
}

/* Read-back phase of the address-uniqueness test.
 *
 * Works on this CPU's slice of the region (bytes / smp_ncpus(), selected
 * by smp_myid()).  The slice is scanned eight quadwords at a time; every
 * quadword is expected to hold its own address.  If any quadword in a
 * group differs, the whole group is reported through dump_cacheline()
 * together with the eight expected (address) values.
 *
 * Returns the size of the slice that was examined.
 */
static size_t uniq_rd( SMP_ptr start, size_t bytes )
{
    enum { QWORDS = 8 };		/* quadwords checked per iteration */
    SMP_ptr line, limit;
    SMP_ptr expected[QWORDS];
    uint64 diff;
    unsigned q;

    bytes /= smp_ncpus();		/* split the work */
    start = (SMP_ptr)( (char *)start + bytes * smp_myid() );
    limit = (SMP_ptr)( (char *)start + bytes );

    for ( line = start; line < limit; line += QWORDS )
    {
        /* accumulate (data - address) for each quadword; any non-zero
         * bit means at least one mismatch in this group */
        diff = 0;
        for ( q = 0; q < QWORDS; q++ )
            diff |= line[q] - (uint64)&line[q];

        if ( diff != 0 )
        {
            /* error condition, not properly handled at present */
            for ( q = 0; q < QWORDS; q++ )
                expected[q] = line + q;

            dump_cacheline( "Unique-Addr", line, (uint64 *)expected );
        }
    }

    return bytes;		/* actual number of bytes read */
}


/*--------------------------------------------------------------------*/
/* Alternate 0,F Test - causes as much transition on the bus as possible */

extern size_t energy_wr_asm( SMP_ptr start, size_t bytes );

/* Write phase of the bus-transition test.
 *
 * Selects this CPU's slice of the region (bytes / smp_ncpus(), indexed by
 * smp_myid()) and lets energy_wr_asm() fill it.  Returns the value
 * energy_wr_asm() reports.
 */
static size_t energy_wr( SMP_ptr start, size_t bytes )
{
    size_t share = bytes / smp_ncpus();			/* per-CPU slice size */
    char *slice = (char *)start + share * smp_myid();	/* this CPU's slice */

    return energy_wr_asm( (SMP_ptr)slice, share );
}


/* Read-back phase of the bus-transition test.
 *
 * Scans this CPU's slice of the region eight quadwords at a time and
 * compares each group against a fixed eight-quadword pattern
 * (0, 0, ~0, ~0, 0x55.., 0xAA.., 0xAA.., 0x55..).  A group containing any
 * mismatch is reported through dump_cacheline() along with the pattern.
 *
 * Returns the size of the slice that was examined.
 */
static size_t energy_rd( SMP_ptr start, size_t bytes )
{
    static uint64 pattern[8] = {
        0UL,
        0UL,
        0xFFFFFFFFFFFFFFFFUL,
        0xFFFFFFFFFFFFFFFFUL,
        0x5555555555555555UL,
        0xAAAAAAAAAAAAAAAAUL,
        0xAAAAAAAAAAAAAAAAUL,
        0x5555555555555555UL
    };
    SMP_ptr line, limit;
    uint64 diff;
    unsigned q;

    bytes /= smp_ncpus();		/* split the work */
    start = (SMP_ptr)( (char *)start + bytes * smp_myid() );
    limit = (SMP_ptr)( (char *)start + bytes );

    for ( line = start; line < limit; line += 8 )
    {
        /* OR together the per-quadword differences: zero means all match */
        diff = 0;
        for ( q = 0; q < 8; q++ )
            diff |= line[q] - pattern[q];

        if ( diff != 0 )
        {
            /* error condition, not handled properly at present */
            dump_cacheline( "Bus transitions", line, pattern );
        }
    }

    return bytes;		/* actual number of bytes read */
}


/*--------------------------------------------------------------------*/
/* Random Data Test */
/* NOTE: SMP won't work yet - random number generator is not MT-safe :-( */

static unsigned seed;

/* Write phase of the random-data test.
 *
 * Seeds the generator from rpcc(), remembers the seed in the file-scope
 * `seed` so rnddata_rd() can replay the sequence, then fills the whole
 * region with successive qrandom() values.  No SMP partitioning is done.
 *
 * Returns the number of bytes requested.
 */
static size_t rnddata_wr( SMP_ptr start, size_t bytes )
{
    SMP_ptr limit = (SMP_ptr)( (char *)start + bytes );
    SMP_ptr cursor = start;
    uint64 value;

    seed = rpcc();				/* pseudo-random input */
    srandom( seed );

    while ( cursor < limit )			/* fill */
    {
        value = qrandom();			/* next pseudorandom value */
        *cursor = value;
        cursor++;
    }

    return bytes;			/* actual number of bytes written */
}


/* Read-back phase of the random-data test.
 *
 * Re-seeds the generator with the value saved by rnddata_wr() and walks
 * the region comparing each location against the regenerated sequence.
 * Every mismatch is reported through mem_logerr().
 *
 * Returns the number of bytes requested.
 */
static size_t rnddata_rd( SMP_ptr start, size_t bytes )
{
    SMP_ptr limit = (SMP_ptr)( (char *)start + bytes );
    SMP_ptr cursor;
    uint64 want, got;

    srandom( seed );		/* reproduce same pseudorandom sequence */

    for ( cursor = start; cursor < limit; cursor++ )
    {
        want = qrandom();
        got = *cursor;

        if ( got != want )
        {
            /* error condition */
            mem_logerr("Random_Data", "corruption", cursor, got, want);
        }
    }

    return bytes;			/* actual number of bytes read */
}


/*----------------------------------------------------------------------*/
/* data-pattern test - random repeated cacheline datapattern */
/* in SMP mode, this test divides memory into equal regions */
/* Note: for SMP mode, we need to regenerate the dpattern test vector
 * consistently...
 */

static uint64 dpattern[MAX_CPUS][8];

/* Write phase of the random-pattern test.
 *
 * Each CPU generates its own eight-quadword pattern into
 * dpattern[smp_myid()] and has dpattern_asm() replicate it over its slice
 * of the region (bytes / smp_ncpus()).  Pattern generation is done while
 * holding drandom_mutex.
 *
 * Returns the value dpattern_asm() reports.
 */
static size_t dpattern_wr( SMP_ptr start, size_t bytes )
{
    unsigned long cycle_seed = rpcc();
    unsigned cpu = smp_myid();
    uint64 *line = dpattern[cpu];	/* this CPU's pattern buffer */
    int q;

    bytes /= smp_ncpus();
    start = (SMP_ptr)( (char *)start + bytes * cpu );

    /* -------- Mutual exclusion -------- */
    /* random number generator is not MT-safe :-( */
    smp_acquire( &drandom_mutex );

    srandom( cycle_seed );

    q = 0;
    while ( q < 8 )
    {
        line[q] = qrandom();
        q++;
    }

    smp_release( &drandom_mutex );
    /* -------- End of mutual exclusion -------- */

    /* now call the asm to blast that into memory */
    return dpattern_asm( start, bytes, line );
}

/* Read-back phase of the random-pattern test.
 *
 * Scans this CPU's slice of the region eight quadwords at a time and
 * compares each group with the pattern this CPU stored in
 * dpattern[smp_myid()] during dpattern_wr().  A group with any mismatch is
 * reported through dump_cacheline().
 *
 * Returns the size of the slice that was examined.
 */
static size_t dpattern_rd( SMP_ptr start, size_t bytes )
{
    unsigned cpu = smp_myid();
    uint64 *pattern;
    SMP_ptr line, limit;
    uint64 diff;
    unsigned q;

    bytes /= smp_ncpus();
    start = (SMP_ptr)( (char *)start + bytes * cpu );
    limit = (SMP_ptr)( (char *)start + bytes );

    pattern = dpattern[cpu];

    for ( line = start; line < limit; line += 8 )
    {
        /* read a cacheline per loop iteration, folding the differences
         * together so that only one conditional is needed */
        diff = 0;
        for ( q = 0; q < 8; q++ )
            diff |= line[q] - pattern[q];

        if ( diff != 0 )
        {
            /* error condition - dump entire cache line */
            dump_cacheline( "Dpattern", line, pattern );
        }
    }

    return bytes;
}


/*----------------------------------------------------------------------*/
/* Gray Code data pattern test - generate cachelines with large numbers 
 * of inversions with a few bits left constant (nasty data) */


extern size_t gc_wr_asm( SMP_ptr start, size_t bytes );
extern size_t gc_rd_asm( SMP_ptr start, size_t bytes );

/* Write phase of the Gray-code data test.
 *
 * Selects this CPU's slice of the region (bytes / smp_ncpus(), indexed by
 * smp_myid()) and lets gc_wr_asm() fill it.  Returns the value
 * gc_wr_asm() reports.
 */
static size_t gc_wr( SMP_ptr start, size_t bytes )
{
    size_t share = bytes / smp_ncpus();			/* per-CPU slice size */
    char *slice = (char *)start + share * smp_myid();	/* this CPU's slice */

    return gc_wr_asm( (SMP_ptr)slice, share );
}

/* Read-back phase of the Gray-code data test.
 *
 * Selects this CPU's slice of the region (bytes / smp_ncpus(), indexed by
 * smp_myid()) and lets gc_rd_asm() check it.  Returns the value
 * gc_rd_asm() reports.
 */
static size_t gc_rd( SMP_ptr start, size_t bytes )
{
    size_t share = bytes / smp_ncpus();			/* per-CPU slice size */
    char *slice = (char *)start + share * smp_myid();	/* this CPU's slice */

    return gc_rd_asm( (SMP_ptr)slice, share );
}




/*----------------------------------------------------------------------*/
/* icache test - execution of random NOPs through memory */
/* the theory behind this one is to turn the rest of memory into nops and
 * execute the lot */


extern uint64 noparray[8];               /* into 64-bit chunks */
extern unsigned retinsn;

#define BCSIZE (8<<20)

/* Write phase of the Istream/Icache test.
 *
 * Fills this CPU's slice of the region (bytes / smp_ncpus(), indexed by
 * smp_myid()) with the instruction pattern in noparray via dpattern_asm(),
 * then overwrites the last instruction of every 32K block with retinsn so
 * that each block can be run as a unit.  Finishes with imb() to
 * synchronise the I-stream with the D-stream.
 *
 * The ret stores are confined to the slice: if the slice is not a whole
 * number of 32K blocks, the trailing partial block gets its ret in the
 * last instruction slot of the slice, instead of at the next 32K boundary
 * (which would lie outside the memory owned by this CPU).
 *
 * Returns the size of the slice that was written.
 */
static size_t icache_wr( SMP_ptr start, size_t bytes )
{
    const size_t blk_insns = 8 << 10;	/* 32K of 32-bit instructions */
    unsigned *code;			/* working at 32-bit granularity */
    size_t ninsns, blk_end;

    /* SMP partitioning */
    bytes /= smp_ncpus();
    start = (SMP_ptr)( (char *)start + bytes*smp_myid() );

    /* now call the asm to blast that into memory */
    dpattern_asm( start, bytes, noparray );

    /* so that we can properly test the Icache, each 32K is passed over
     * twice, so that the first pass is answered from memory, and the second
     * from the Icache.  NB 32K is 8<<10 instructions */

    code   = (unsigned *)start;
    ninsns = bytes / sizeof( unsigned );

    /* pop a ret into the last slot of every complete 32K block */
    for ( blk_end = blk_insns; blk_end <= ninsns; blk_end += blk_insns )
        code[blk_end - 1] = retinsn;

    /* terminate a trailing partial block inside the slice */
    if ( ninsns % blk_insns != 0 )
        code[ninsns - 1] = retinsn;

    imb();	/* synchronise I-stream with D-stream */

    return bytes;
}

extern size_t icache_rd_asm( SMP_ptr start, size_t bytes );

/* Execute phase of the Istream/Icache test.
 *
 * Selects this CPU's slice of the region (bytes / smp_ncpus(), indexed by
 * smp_myid()) and hands it to icache_rd_asm().  Returns the value
 * icache_rd_asm() reports.
 */
static size_t icache_rd( SMP_ptr start, size_t bytes )
{
    size_t share = bytes / smp_ncpus();			/* per-CPU slice size */
    char *slice = (char *)start + share * smp_myid();	/* this CPU's slice */

    return icache_rd_asm( (SMP_ptr)slice, share );
}



/*----------------------------------------------------------------------*/
/* Strided memory test */
/* Stride over a memory region using a stride calculated to be large and 
 * exercise several separate areas in the memory region simultaneously.
 *
 * The exercising bit pattern is a set of rotating ones and zeroes
 */

/* Trial-division primality test.
 *
 * val is treated as an unsigned quantity (the caller passes unsigned
 * values through the int parameter).  Returns TRUE if it is prime, FALSE
 * otherwise; 0 and 1 are not prime.
 *
 * The loop bound is written as i <= n / i rather than i*i <= n so that it
 * cannot wrap around when n is close to the top of the unsigned range.
 */
static int is_prime( int val )
{
    unsigned n = (unsigned)val;
    unsigned i;

    if ( n < 2 )
        return FALSE;

    for ( i = 2; i <= n / i; i++ )
        if ( n % i == 0 )               return FALSE;       /* i is a factor */

    return TRUE;
}

/* Search downwards from `range` for a prime.
 *
 * Returns the largest prime p with 1 < p <= range, or 0 if there is none
 * (i.e. range < 2).
 */
static unsigned find_largest_prime( unsigned range )
{
    unsigned candidate = range;

    while ( candidate > 1 )
    {
        if ( is_prime( candidate ) )
            return candidate;
        candidate--;
    }

    return 0;				/* failure case */
}



/* Shared state for the strided test.  prime and stride are computed by
 * the primary CPU in strided_wr() and read by every CPU after the
 * smp_sync() there; strided_randval is the 64-bit random seed value for
 * the rotating data pattern. */
static unsigned prime=0, stride=0;
static uint64 strided_randval;

/* Number of cache lines strided_wr() writes to in an interleaved fashion.
 * The unrolled stores in strided_wr() are written out for exactly 8. */
#define SIMULTANEOUS_BLKS 8

/* Rotate a 64-bit quantity by n bits.  As written the low n bits move to
 * the top, i.e. this is a rotate RIGHT by n (equivalently left by 64-n),
 * despite the ROL name.  n must be in 1..63: n == 0 would shift by 64.
 * Both arguments are evaluated twice. */
#define QW_ROL(val, n)	( ((val) << (64 - (n))) | ((val) >> (n) ) )

/* Write phase of the strided test.
 *
 * The region is treated as bytes/64 cache lines.  The primary CPU picks
 * prime = largest prime <= 2*ncachelines/3 and stride = prime * ncpus;
 * each CPU then starts at line (prime*myid) % ncachelines and repeatedly
 * steps by stride (mod ncachelines).  Lines are gathered in groups of
 * SIMULTANEOUS_BLKS and written quadword-by-quadword across the group.
 *
 * With ADDRESS_EQUALS_DATA defined (as it is below) every quadword is
 * written with the low 32 bits of its own address; the rotating random
 * pattern in val0..val7 is computed but not stored.
 *
 * Returns bytes / ncpus.
 *
 * NOTE(review): ncachelines == 0 (bytes < 64) leads to a modulo by zero
 * below - callers presumably always pass a large region; confirm.
 */
static size_t strided_wr( SMP_ptr start, size_t bytes )
{
    unsigned ncachelines = bytes / 64;
    register uint64 val0, val1, val2, val3, val4, val5, val6, val7;
    uint64 *clptr[SIMULTANEOUS_BLKS];	/* the group of lines being written */
    register uint64 clval;		/* current cache line index */
    unsigned i, j;
    unsigned myid = smp_myid();
    unsigned ncpus = smp_ncpus();


#if 1
    imb();		/* DEBUG - flush icache, make more work for CPU */
#endif

    /* Set up our test data, SMP problem partitioning */
    if ( smp_primary() )
    {
	/* a good stride */
	prime = find_largest_prime( 2*ncachelines / 3 );
	stride = prime * ncpus;
	srandom( rpcc() );		/* fresh random sequence */
	strided_randval = qrandom();	/* generate 64-bit random val */
    }
    smp_sync();				/* Everyone has the same data now */

    /* Starting line for this CPU; successive CPUs are offset by `prime` */
    clval = (prime*myid) % ncachelines;

    /* Eight rotations of the shared random value, one per line in a group
     * (only stored to memory when ADDRESS_EQUALS_DATA is not defined) */
    val0 = strided_randval;
    val1 = QW_ROL(strided_randval, 1 );
    val2 = QW_ROL(strided_randval, 2 );
    val3 = QW_ROL(strided_randval, 3 );
    val4 = QW_ROL(strided_randval, 4 );
    val5 = QW_ROL(strided_randval, 5 );
    val6 = QW_ROL(strided_randval, 6 );
    val7 = QW_ROL(strided_randval, 7 );


    /* Fill memory */

    /* Each pass handles SIMULTANEOUS_BLKS lines on this CPU, so the loop
     * counter advances by that many lines times the number of CPUs */
    for ( i=0; i<ncachelines; i += SIMULTANEOUS_BLKS * ncpus )
    {
	/* First iterate in the series to find our next set of cache blocks */
	for ( j=0; j<SIMULTANEOUS_BLKS; j++ )
	{
	    /* Find our cachelines */
	    clval = (clval+stride) % ncachelines;
	    clptr[j] = (uint64 *)( (uint64)start + clval*64 );
	}

	/* Now iterate across the cache blocks, writing quadword j of each
	 * block in turn before moving on to quadword j+1 */
	for( j=0; j<8; j++ )
	{
	    /* Fill it and compute next cache line data in situ */
/* ADDRESS_EQUALS_DATA is also tested by strided_rd() later in this file,
 * so it must stay defined at file level from this point on. */
#define ADDRESS_EQUALS_DATA
#ifdef ADDRESS_EQUALS_DATA
	    /* store the low 32 bits of each quadword's own address */
	    clptr[0][j] = (uint64)&clptr[0][j] & 0xFFFFFFFFUL;
	    clptr[1][j] = (uint64)&clptr[1][j] & 0xFFFFFFFFUL;
	    clptr[2][j] = (uint64)&clptr[2][j] & 0xFFFFFFFFUL;
	    clptr[3][j] = (uint64)&clptr[3][j] & 0xFFFFFFFFUL;
	    clptr[4][j] = (uint64)&clptr[4][j] & 0xFFFFFFFFUL;
	    clptr[5][j] = (uint64)&clptr[5][j] & 0xFFFFFFFFUL;
	    clptr[6][j] = (uint64)&clptr[6][j] & 0xFFFFFFFFUL;
	    clptr[7][j] = (uint64)&clptr[7][j] & 0xFFFFFFFFUL; 	/* = SIMULTANEOUS_BLKS */
#else
	    clptr[0][j] = val0;
	    clptr[1][j] = val1;
	    clptr[2][j] = val2;
	    clptr[3][j] = val3;
	    clptr[4][j] = val4;
	    clptr[5][j] = val5;
	    clptr[6][j] = val6;
	    clptr[7][j] = val7; 	/* = SIMULTANEOUS_BLKS */
#endif
	}


#ifndef ADDRESS_EQUALS_DATA
	/* advance the data pattern for the next group of lines */
	val0 = QW_ROL( val0, 8 );
	val1 = QW_ROL( val1, 8 );
	val2 = QW_ROL( val2, 8 );
	val3 = QW_ROL( val3, 8 );
	val4 = QW_ROL( val4, 8 );
	val5 = QW_ROL( val5, 8 );
	val6 = QW_ROL( val6, 8 );
	val7 = QW_ROL( val7, 8 );
#endif
    }

    return bytes / ncpus;
}


/* Two alternative read-back routines for the strided test.  STRIDED_READ
 * is explicitly undefined here, so the first (strided) variant is compiled
 * out and the sequential variant after the #else is the one in use. */
#undef STRIDED_READ
#ifdef STRIDED_READ
/* Reading back: we rely on ECC logic for error detection, but we still read 
 * back using a funky access pattern 
 */

/* Strided variant (currently disabled).  Follows the same start line and
 * stride as strided_wr(), XORs every quadword visited into a checksum and
 * raises an alert if the checksum is non-zero.  Returns bytes / ncpus. */
static size_t strided_rd( SMP_ptr start, size_t bytes )
{
    unsigned ncachelines = bytes / 64;
    register uint64 sum;
    register uint64 *clptr;
    register uint64 clval;
    unsigned i;
    unsigned myid = smp_myid();
    unsigned ncpus = smp_ncpus();


#if 1
    imb();	/* DEBUG - flush icache, make more work for CPU */
#endif

    clval = (prime*myid) % ncachelines;

    for ( sum=0, i=0; i<ncachelines; i += ncpus )
    {
	/* Find our cacheline */
	clval = (clval+stride) % ncachelines;
	clptr = (uint64 *)( (uint64)start + clval*64 );

	/* Fold all eight quadwords of the line into the checksum */
	sum ^= clptr[0];
	sum ^= clptr[1];
	sum ^= clptr[2];
	sum ^= clptr[3];
	sum ^= clptr[4];
	sum ^= clptr[5];
	sum ^= clptr[6];
	sum ^= clptr[7];
    }

    if ( sum != 0 )
	mobo_alertf("Bizarre!", "Memory checksum didn't work out right" );
	
    return bytes/ncpus;
}
#else							/* STRIDED READ */

/* Read back sequentially.  Just out of interest, every processor reads
 * all the data.  */
/* NOTE: the implementation below requires ADDRESS_EQUALS_DATA to be set */

/* Sequential variant (the one compiled).  No SMP partitioning: the whole
 * region is walked one quadword at a time.  Each quadword, ORed with
 * DBM_SUPER, is expected to equal its own address; strided_wr() stored only
 * the low 32 bits of the address.  Mismatches are reported through
 * mobo_alertf().  Returns bytes.
 * NOTE(review): DBM_SUPER is defined elsewhere - presumably the upper
 * address bits of the mapping used for the test region; confirm. */
static size_t strided_rd( SMP_ptr start, size_t bytes )
{
    register uint64 *ptr, *end;

    /* ADDRESS_EQUALS_DATA is #defined inside strided_wr() earlier in this
     * file; refuse to build if the write routine used the other pattern */
#ifndef ADDRESS_EQUALS_DATA
#error This test requires address=data to be set in the write-out routine
#endif

    end = (uint64 *)((uint64)start + bytes);
    for ( ptr=(uint64 *)start; ptr < end; ptr++ )
    {
	if ( (DBM_SUPER | *ptr) != (uint64)ptr )
	{
	    mobo_alertf( "Woo-hoo!",
			 "At address 0x%016lx, I read data 0x%016lx\n",
			 ptr, *ptr );
	}
    }
    return bytes;
}
#endif							/* STRIDED READ */



/*----------------------------------------------------------------------*/
/* Cached data exchange test */
/* Write and read interleaved quadwords in the same cache blocks on 
 * different processors.  Should generate plenty of cache coherency testing
 */


/* Write phase of the cacheline-exchange test.
 *
 * The region is processed one megabyte at a time, with all CPUs meeting
 * at smp_sync() after each megabyte.  Within a megabyte, CPU `myid`
 * writes quadword slots myid, myid+ncpus, ... (below QWDS_PER_CACHELINE)
 * of every cache line, so different CPUs write interleaved quadwords of
 * the same lines.  Each quadword is written with its own address XORed
 * with a per-CPU random value; the seed for that value is kept in
 * seed_cache[myid] so cachexch_rd() can regenerate it.
 *
 * Only whole megabytes are written (bytes >> 20 of them).  The return
 * value is bytes / ncpus, computed from the unrounded byte count.
 *
 * The megabyte and cache-line counters are size_t so that the byte offset
 * (mbyte << 20) is computed at full width and does not wrap for regions
 * of 4GB or more.
 */
static size_t cachexch_wr( SMP_ptr start, size_t bytes )
{
    register unsigned myid = smp_myid();
    register unsigned ncpus = smp_ncpus();
    register uint64 *ptr, *addr;
    register unsigned pass;
    register size_t cl, cl_last, mbyte, mb_last;
    register uint64 val;

    seed_cache[myid] = rpcc();
    mb_last = bytes >> 20;
    cl_last = (1UL<<20) / (sizeof(uint64)*QWDS_PER_CACHELINE);

    /* -------- Mutual exclusion -------- */
    /* random number generator is not MT-safe :-( */

    smp_acquire( &drandom_mutex );
    srandom( seed_cache[myid] );
    val = qrandom();
    smp_release( &drandom_mutex );

    /* -------- End of mutual exclusion -------- */

    /* We process 1MB of memory on all processors in lock-step */
    for ( mbyte = 0; mbyte < mb_last; mbyte++ )
    {
	ptr = (uint64 *)((char *)start + (mbyte << 20));

	/* Across all processors combined, we perform QWDS_PER_CACHELINE
	 * passes over the selected MByte */

	for ( pass=myid; pass<QWDS_PER_CACHELINE; pass+=ncpus )
	{
	    for ( cl=0; cl < cl_last; cl++ )
	    {
		addr = ptr + cl*QWDS_PER_CACHELINE + pass;
		*addr = (uint64)addr ^ val;
	    }
	}
	smp_sync();
    }
    return bytes / ncpus;
}


/* Read-back phase of the cacheline-exchange test.
 *
 * The byte count is first rounded down to a whole number of megabytes,
 * matching the unit of work in cachexch_wr().  CPU `myid` then re-derives
 * its random value from seed_cache[myid] and checks the quadword slots
 * myid, myid+ncpus, ... of every cache line in the region: each must
 * equal its own address XORed with that value.  Mismatches are reported
 * through mobo_alertf().
 *
 * Returns the rounded byte count divided by ncpus.
 *
 * The cache-line counters are size_t so that neither the line count nor
 * the quadword offset cl*QWDS_PER_CACHELINE is truncated to 32 bits for
 * very large regions.
 */
static size_t cachexch_rd( SMP_ptr start, size_t bytes )
{
    register unsigned myid = smp_myid();
    register unsigned ncpus = smp_ncpus();
    register uint64 *ptr;
    register unsigned pass;
    register size_t cl, cl_last;
    register uint64 val;

    /* Round down to the nearest mbyte, which is the write fn's work unit */
    bytes &= ~((1UL<<20) - 1);

    cl_last = bytes / (sizeof(uint64)*QWDS_PER_CACHELINE);

    /* -------- Mutual exclusion -------- */
    /* random number generator is not MT-safe :-( */

    smp_acquire( &drandom_mutex );
    srandom( seed_cache[myid] );
    val = qrandom();
    smp_release( &drandom_mutex );

    /* -------- End of mutual exclusion -------- */

    for ( pass=myid; pass<QWDS_PER_CACHELINE; pass+=ncpus )
    {
	for ( cl = 0; cl < cl_last; cl++ )
	{
	    ptr = (uint64 *)start + cl*QWDS_PER_CACHELINE + pass;

	    if ( *ptr != (val ^ (uint64)ptr) )
	    {
		mobo_alertf( "Discrepancy!",
			"0x%016lx: Read 0x%016lx expected 0x%016lx",
			ptr, *ptr, val ^ (uint64)ptr );
	    }
	}
    }
    return bytes / ncpus;
}

