/************************************************************************
    Modified winnow algorithm.  Permission to distribute is granted.
    (C) Copyright Avrim Blum 1995


    Each "specialist" is a pair of variable values.  I.e., a rule of size 2.

    Rules abstain when they're first seen.  They store the last 5 outcomes.

    If "accuracy/converage" is NOT selected, then each rule predicts
    by using a majority vote over its history.  If is IS selected, then
    each rule distributes its weight among the outcomes in its history (so
    if one outcome appears 3 times and one appears twice, then 3/5 of the
    weight goes to the first outcome and 2/5 to the other one).
    The outcome predicted by the global algorithm is the winner in the
    (weighted) vote.

    In either case above, rules are rewarded or penalized based on the
    result of their majority vote.  We penalize even when global doesn't
    make a mistake.

    Note: multiplying by 0.5 to penalize and by 1.5 to reward.  Experiments
    do not seem to be sensitive to exact quantities.

    Allow to "brain-damage" by forcing a batch mode: don't use any information 
    learned in the day of the current example for making your prediction.

    Command line arguments allowed:
     first arg is feature set, second is what to predict,
     third is output file and fourth is data file.
     Fifth is when to stop.
     sixth, if it exists, is 'b' to "brain-damage" by running in batch mode or
     's' to split votes for improved accuracy/coverage tradeoff.

     E.g.,
     varval_pairs 0 req-location outputfile mitchell-data 1685 s

     If you use command line args, then program runs in non-interactive mode.

   Interactive mode commands:
     "s" is number to skip.
     "z" to zero out the mistake values.
     "p" to print the weights to all choices.
     output: prints out predictors for that output 
     "q" to quit.

     There is a global default of "*******" that we predict if no specialist
     says anything.

******************************************************************************/

#include <stdio.h>
#include <math.h>
#include <ctype.h>
#include <strings.h>
#include "header.h"

/* DEFINES  */
#define MAX_NUM_TUPLES 595  /* MAX_FEATURES choose TUPLE_SIZE */
#define TUPLE_SIZE 2    /* using pairs  */

#define multiplier 1.5
#define penalty 0.5
#define AMT 5    /* store last 5 things per.  */

#define NUM_BOXES 20  /* for accuracy/coverage */

/* This struct defines a "specialist".  the name field tells the
   values that activate the specialist.  The remaining fields store
   the last up-to AMT outputs, the actual number of things in the "history" 
   array, the number of mistakes and correct predictions it's made, as
   well as other information.
 */
typedef struct specialist_struct {
  example_t name[TUPLE_SIZE];  /* array of names */
  int history[AMT];     /* indices to last AMT correct answers
				in the global_out_array */
  int historysize;           /* number seen. At most AMT */
  int mistakes;
  int corrects; 
  int correct_when_global_failed;

  double weight;
  int majority;
  double oldweight; 
  int oldmajority, update_date;  /* for brain-damaged learner */
  struct specialist_struct *next;
} specialist_t;

/* for each pair of features, we will store all the specialists that 
   correspond to settings of those features (e.g., event-type == meeting
   and number-of-person == 1) in a hash table.
 */
#define HASH_SIZE 101
typedef specialist_t *spec_table[HASH_SIZE];


/* globals */
FILE *fileptr, *summaryfp = NULL;
int findex;       /* which feature set are we using? */
int NUM_INPUTS;   /* how many inputs do we actually have? */
int NUM_TUPLES;   /* how many tuples do we actually have? */
spec_table all_specialists[MAX_NUM_TUPLES]; /* one table for each tuple */

char global_out_array[maxoutputs][MAX_FEATURES]; /* global array containing
						 names of the output values. */
int num_outputs = 0;
double votes[maxoutputs];           /* votes for each possible answer
                                       Global so that can use for printing */
char *default_ans;                  /* using a SINGLE, GLOBAL default here */
int our_mistakes[NUM_BOXES];        /* total number of mistakes of us */
example_t inputs[MAX_FEATURES];     /* holds list of input features */
specialist_t *specialist[MAX_NUM_TUPLES]; /* pointers to awake specialists */
example_t correct_ans;           /* holds correct answer */
int correct_index;               /* index of correct_ans in global_out_array*/
char *our_prediction;            /* holds prediction */
double our_prediction_weight, our_total_weight;  /* what were the weights? */
int seen_so_far[NUM_BOXES];      /* seen_so_far[0] = # examples seen. */
int we_made_mistake;             /* did we just make a mistake? */
int brain_damage = 0;            /* default is NO brain damage */
int interactive = 1;             /* interactive mode? */
int splitvotes = 0;  /* split votes for improved accuracy/coverage tradeoff? */
int OUTPUTTIME = 1685;     /* when to output */

/* functions that return values */
char *concat_entries(); 
specialist_t *lookup(spec_table t, char *s[TUPLE_SIZE]);
specialist_t *insert(spec_table t, char *s[TUPLE_SIZE]);

int main(int argc, char **argv)
{
  int start = 1, r;
  init(argc, argv); /* initialize. Read through first example name. */
  
  while ((r=read_current_example(fileptr, the_feature_set[findex].features,
				 the_feature_set[findex].num_features,
				 inputs)) != -1) {
    if (r <-1) {
      if (r==NORESULT) printf("no result in ");
      else printf("couldn't find all features in ");
      printf("example %s.\ngoing on to next example....\n",example_name);
      find_next_desired_event(fileptr);
      continue;
    }
    global_predict();  /* predict */
    evaluate_result();  /* evalutate results and update globals */
    pretty_print(); /* print out info.  Also, update number of mistakes */

    update_specs();   /* update specialists */

    find_next_desired_event(fileptr);  /* read up to next add/copy event */
  }
  return 0;
}


/* fills the specialist array.  If input is new, puts into all_specialists */
map_specialists()
{
  int i,j;
  int tuple[TUPLE_SIZE];
  char *values[TUPLE_SIZE];
  specialist_t *ptr;

  /* initialize tuple */
  for(j=0; j < TUPLE_SIZE; ++j) tuple[j] = j;

  for(i=0; i < NUM_TUPLES; ++i) {             /* for each tuple */
    for(j=0; j < TUPLE_SIZE; ++j) values[j] = inputs[tuple[j]];
    ptr = lookup(all_specialists[i], values); /* find it */
    if (!ptr)                                 /* not there so create new one*/
      ptr = insert(all_specialists[i], values);
    specialist[i] = ptr;
    increment_tuple(tuple);   /* increment the tuple */
  }
}


/* increments an array of integers indicating a subset of TUPLE_SIZE elements
   of {0,...,NUM_INPUTS-1}.  Returns 1 if can, or 0 if we already had the last 
   one.  Eg. TS = 2, NI = 5.  Then, progression is:
                           01,02,03,04,12,13,14,23,24,34.
 */
increment_tuple(val_array)
     int val_array[TUPLE_SIZE];
{
  int i,j;
  for (j=TUPLE_SIZE-1; j > -1; --j)
    if (val_array[j] < NUM_INPUTS - (TUPLE_SIZE - j)) {
      ++val_array[j];
      for(i=j+1; i < TUPLE_SIZE; ++i)
	val_array[i] = val_array[i-1] + 1;
      return( 1 );
    }
  return( 0 );
}


/* looks at all awake specialists and then makes up prediction 
   based on the multiplier and penalty on number of corrects/incorrects.
   Weight is (multiplier)^{# correct_when_global_failed} * 
   (penalty)^{# mistakes}

   For simplicity, this communicates its results using global variables.

 */ 

global_predict(void)
{
  int i,j, best_ans;
  int currentdate;
  double maxweight = 0.0, total_weight = 0.0;
  specialist_t *ptr;

  for(j=0; j < num_outputs; ++j) votes[j] = 0.0;   /* start at 0 */
  map_specialists();    /* fill the input_var array */
  currentdate = get_current_date();

  for(i=0; i < NUM_TUPLES; ++i) {             /* for each input */
    ptr = specialist[i];
    if (splitvotes)
      for(j=0; j < ptr->historysize; ++j) {   /* split votes(no brain damage)*/
	votes[ptr->history[j]] += ptr->weight/ptr->historysize;
      }
    else if (ptr->historysize > 0) {          /* vote with majority */
      if (!brain_damage) votes[ptr->majority] += (double) ptr->weight;
      else {                                  /* brain damage       */
	if (currentdate != ptr->update_date) {/* can now use the most recent*/
	  ptr->oldmajority = ptr->majority;  
	  ptr->oldweight = ptr->weight;
	  ptr->update_date = currentdate;
	}
	votes[ptr->oldmajority] += (double) ptr->oldweight;
      }
    }
  }
  
  for(j=0; j < num_outputs; ++j) {            /* find the winner */
    total_weight += votes[j];
    if (votes[j] > maxweight) {
      maxweight = votes[j];
      best_ans = j;
    }
  }
  if (maxweight > 0) {              /* set up global variables */
    our_prediction_weight = maxweight;
    our_total_weight = total_weight;
    our_prediction = global_out_array[best_ans];
  } else {
    printf("using default\n");
    our_prediction_weight = 1.0; our_total_weight = 1.0;
    our_prediction = default_ans;
  }
}

/* this updates the specialists.

  Note: potential for overflow in way things are done here.
  To be safe, should be done with logs. 

*/
update_specs()
{
  int i,j;
  specialist_t *ptr;

  for(i = 0;  i < NUM_TUPLES; ++i) {
    ptr = specialist[i];
    /* update number of mistakes and correct answers if it predicted*/
    if (ptr->historysize > 0) { 
      if (ptr->majority == correct_index) {
	++(ptr->corrects);
	if (we_made_mistake) ++(ptr->correct_when_global_failed);
      } 
      else ++(ptr->mistakes);

      ptr->weight = pow(multiplier, (double)
			(ptr->correct_when_global_failed)) * 
			  pow(penalty, (double) (ptr->mistakes));
    } else {   
      ptr->weight = 1.0;   /* this is to start things. */
    }

    /* put correct answer into array */    
    if (ptr->historysize < AMT)  ++(ptr->historysize);  /* used near start */

    for(j = (ptr->historysize) - 1; j > 0; --j)
      ptr->history[j] = ptr->history[j-1];
    ptr->history[0] = correct_index;
    
    ptr->majority = best_choice(ptr->history,ptr->historysize); /*update*/
  }
}


/* this prints out information:
   Our top choices and their weights, if desired.
   Also, can request for results of all algs that predicted now.

 */
pretty_print()
{
  int i,j,mistakes, top_four[4], cur_index;
  double min_val, max_val;
  char string[100];
  specialist_t *ptr;
  static int skipnumber = 0;  /* skip this many examples before printing */

  if (seen_so_far[0] == OUTPUTTIME) { /* print out and quit */
    fprintf(summaryfp,
    "name                            number mistakes fraction_correct\n");
    fprintf(summaryfp,"%s\t",example_name);
    fprintf(summaryfp,"%5d\t", seen_so_far[0]);
    fprintf(summaryfp,"%5d \t",our_mistakes[0]);
    fprintf(summaryfp,"%.3f\t\n",1.0-our_mistakes[0]/((float)seen_so_far[0]));
    fprintf(summaryfp,"(coverage, accuracy) results:\n");
      
    for(i=0; i < NUM_BOXES; ++i) {
      fprintf(summaryfp,"(%.3f %.3f) ",((float) seen_so_far[i]) / 
	      seen_so_far[0], 1.0-our_mistakes[i]/((float) seen_so_far[i]));
    }
    fprintf(summaryfp,"\n\n");
      
    exit(0); /* Might as well stop now */
  }
  
  if (!interactive || skipnumber > 0) {   /* don't print */
    --skipnumber;
    return;
  }

  /* print out data */
  printf("Example %d: %s\n", seen_so_far[0], example_name);
  for(i=0; i < NUM_INPUTS; ++i) 
    printf("%s: %s\n",the_feature_set[findex].features[i],inputs[i]);

  printf("Our prediction: %s    Correct: %s\n", our_prediction, correct_ans);
  printf("             %d mistakes so far\n", our_mistakes[0]);

  /* print out top 4 choices and weights */
  printf("top 4 choices and their weights:\n   ");
  for(min_val = 0.0, max_val = NUM_TUPLES, i=0; i < 4; ++i) {
    cur_index = -1;
    for(j=0; j < num_outputs; ++j)
      if (votes[j] > min_val && votes[j] < max_val) {
	cur_index = j;
	min_val = votes[j];
      }
    top_four[i] = cur_index;
    max_val = min_val;
    min_val = 0.0;
  }
  for(i=0; i < 4; ++i) {
    if (top_four[i] > -1)
      printf("(%s %.3f) ",global_out_array[top_four[i]],votes[top_four[i]]);
    else printf("(%s 0.0) ",default_ans);
  }
  printf("\n\n");
	
  /* now, get return to continue, or a command to print out into. */
  if (seen_so_far[0]==1) {
    printf("type <cr> for next, <num>s to skip printing of next <num>\n");
    printf("examples, p to print weights on all choices,\n");
    printf("P to print all current votes, z to zero counts\n");
    printf("and q to quit.\n");
  }
  while(1) {
    gets(string);
    if (string[0] == '\0') return;
    /* On "z", zero out algorithm mistakes.  */
    if (string[0] == 'z') {
      our_mistakes[0] = 0; 
      continue;
    }
    /* on "p", print out weights for all choices */
    if (string[0] == 'p') {
      for(i=0; i < num_outputs; ++i) {
	printf("%s: %g, ",global_out_array[i],votes[i]);
      }
      printf("\n");
      continue;
    }
    if (string[0] == 'q') exit(0);
    
    if (string[strlen(string)-1] == 's' && sscanf(string,"%d",&j))  {
      skipnumber = j-1;
      return;
    }
    if (string[0] == 'P') {
      for(j = 0;  j < NUM_TUPLES; ++j) { /* for each specialist */
	ptr = specialist[j];
	if (ptr->historysize == 0) continue;
	printf("[");
	for(i=0; i < TUPLE_SIZE - 1; ++i) printf("%s ",ptr->name[i]);
	printf("%s] ",ptr->name[TUPLE_SIZE - 1]);
	printf("predicts %s. ",
	      global_out_array[best_choice(ptr->history,ptr->historysize)]);
	printf("Weight: %.3f\n",ptr->weight);
	printf("   last %d: ",ptr->historysize);
	for(i=0; i < ptr->historysize; ++i) 
	  printf("%s, ",global_out_array[ptr->history[i]]);
	printf("\n");
      }
      continue;
    }
    printf("unknown command/algorithm name.\n");
    continue;
  }
}




/*  Finds out data file and initializes things */
init(int argc, char** argv)
{
  int i,j;
  char filename[100], *ptr, junk[100];
  if (argc == 2) { /* print out help */
    printf(" \
  First arg is feature set: 0=loc,1=dur,2=start,3=d.o.w,4=big\n \
  Second is what to predict, Third is output file, Fourth is data file\n\
  Fifth is when to stop\n\
  Sixth (if it exists) is 'b' to brain-damage by forcing batch mode,\n\
  and 's' to split votes for improved accuracy/coverage tradeoff\n");
    exit(0);
  }
  if (argc >=3) {
    sscanf(argv[1],"%d",&findex);
    sscanf(argv[2],"%s",to_predict);
    interactive = 0;  /* non-interactive mode */
  } else {
    /* find which feature set to use */
    printf("avaliable feature sets: ");
    for(i=0; i < num_feature_sets; ++i)
      printf("%s (%d), ", the_feature_set[i].name, i);
    printf("\nenter desired feature set number: ");
    scanf("%d",&findex);
    printf("what feature do you want to predict (type the name)? ");
    scanf("%s",to_predict);
    gets(junk);
  }
  NUM_INPUTS = the_feature_set[findex].num_features;
  for(NUM_TUPLES=1,i=0; i < TUPLE_SIZE; ++i)
    NUM_TUPLES *= (NUM_INPUTS - i);
  for(i=0; i < TUPLE_SIZE; ++i) NUM_TUPLES = NUM_TUPLES/(i+1);
  printf("%d tuples.\n",NUM_TUPLES);
  
  if (argc >= 4)
    strcpy(filename, argv[3]);
  else {
    printf("output file for summary (<cr> for none): ");
    gets(filename);
  }
  if (*filename) {
    if ((summaryfp = fopen(filename,"a")) == NULL)
      printf("can't open file. Not creating summary.\n");
  } else {
    summaryfp = stdout;
  }
  fprintf(summaryfp,"predicting %s with %s\n",to_predict,
	  the_feature_set[findex].name);
  if (argc >= 5) 
    strcpy(filename, argv[4]);
  else {
    printf("Give data file: ");
    gets(filename);
  }
  if (*filename == '\0' || (fileptr = fopen(filename,"r")) == NULL) {
    printf("can't open file '%s'.\n", filename);
    exit(1);
  }

  /* initialize global vars */
  for(i=0; i < NUM_BOXES; ++i) {
    seen_so_far[i] = 0;   /* for confidence box i, how many has it tried */
    our_mistakes[i] = 0;  /* for confidence box i, how many has it failed */
  }


  /* initialize default */
  default_ans = "*******";

  /* old program used to ask for start date, but now just start at beginning*/
  read_upto_date(fileptr,1,1,1);

  if (argc >=6) sscanf(argv[5], "%d",&OUTPUTTIME);
  else if (interactive && summaryfp != stdout) {
    printf("Time to output (or <cr> for %d): ", OUTPUTTIME);
    gets(junk);
    if (*junk) sscanf(junk,"%d", &OUTPUTTIME);
  }
  if (argc >= 7) {
    if (*argv[6] == 'b') brain_damage = 1;
    if (*argv[6] == 's') splitvotes = 1;
  } else if (interactive) {
    printf("Enter 'b' to brain-damage learner, 's' to split votes for better\n\
            accuracy/coverage tradeoff, or <cr> for neither: ");
    gets(junk);
    if (*junk == 'b') brain_damage = 1;
    if (*junk == 's') splitvotes = 1;
  }
  if (brain_damage) fprintf(summaryfp,"brain-damage: using batch method\n");
  if (splitvotes) fprintf(summaryfp,"using split-votes method\n");
}


/* get index of correct answer
   Update global variables.

   This also should be the ONLY function (besides the pretty printer)
   that looks at the correct answer. 
 */
evaluate_result(void)
{
  int i,j;
  double mult;

  we_made_mistake = (strcmp(our_prediction,correct_ans) == SAME) ? 0 : 1;

  /* for each "confidence box", update number of predictions,mistakes */
  /* box i only looks at cases where best/total >= 1 - 0.8^i.         */
  for(i=0, mult = 1.0; i < NUM_BOXES; ++i, mult *= 0.8)
    if (our_prediction_weight/our_total_weight >= 1.0 - mult) {
      ++seen_so_far[i];
      if (we_made_mistake) ++our_mistakes[i];
    }


  for(j=0; j < num_outputs; ++j) {
    if (strcmp(correct_ans, global_out_array[j]) == SAME) {
      correct_index = j;
      break;
    }
  }
  if (j == num_outputs) {  /* it is a NEW output */
    strcpy(global_out_array[num_outputs], correct_ans);
    correct_index = num_outputs;
    ++num_outputs;
  }
}

/* gets current date and store into one integer mmddyy */
int get_current_date(void)
{
  char *ptr;
  int month, day, year, result;
  for(ptr = example_name; *ptr != '-'; ++ptr); /* go to first dash */
  sscanf(ptr,"-event-%d-%d-%d",&month, &day, &year); /* now get month,day,year */
  result = month * 10000 + day * 100 + (year % 100);
  return result;
}


/**************hash table functions*************/
int hash_situation(char *s[TUPLE_SIZE])
{
  int i,h;
  char *ptr;
  for(i=0, h=0; i < TUPLE_SIZE; ++i) {
    for(ptr = s[i]; *ptr != '\0'; ++ptr) h = (h*4 + *ptr) % HASH_SIZE;
  }
  return h;
}

specialist_t *lookup(spec_table t, char *s[TUPLE_SIZE])
{
  int i, index = hash_situation(s);
  specialist_t *ptr;
  for(ptr = t[index]; ptr; ptr = ptr->next) {
    for(i=0; i < TUPLE_SIZE && strcmp(ptr->name[i],s[i]) == SAME; ++i);
    if (i == TUPLE_SIZE) return ptr;
  }
  return NULL;
}
    
specialist_t *insert(spec_table t, char *s[TUPLE_SIZE])
{
  int i,index = hash_situation(s);
  specialist_t *ptr;
  ptr = (specialist_t *) calloc(1, sizeof(specialist_t));
  for(i=0; i < TUPLE_SIZE; ++i) strcpy(ptr->name[i], s[i]);/* copy data over */
  ptr->next = t[index];
  t[index] = ptr;
  return ptr;
}
