/*
   File: lexicon_search.c
   searching in the lexicon.

   Copyright 2005 Radboud University of Nijmegen
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id: lexicon_info.c,v 1.19 2006/09/27 16:01:35 marcs Exp $
*/

/* Selects the storage layout of the edit distance list `worde`:
   non-zero uses one flat array addressed through s(x) + y,
   zero uses an array of separately allocated rows (worde[x][y]). */
#define MONOLITIC_LIST 0

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

/* standard include files */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* agfl include files */
#include <abase_memalloc.h>
#include <abase_error.h>
#include <lexicon.h>

/* local include files */
#include "opt.h"
#include "search.h"
#include "edit_distance.h"

/* Report the program name and the active settings; silent unless the
   verbose option is set. */
static void
be_verbose (void)
{
   const char *list_state;

   if (!verbose) return;

   abs_printf ("%s\n", get_name());
   abs_printf ("  max edit distance: %d\n", max_edit_distance);
   list_state = with_edit_distance_list ? " on" : "off";
   abs_printf ("  use of edit distance list is %s\n", list_state);
   abs_printf ("  lexicon: %s\n", get_lexicon_file_name());
}

/* Open the file `name` for reading, hand the stream to lexicon_new and
   close it again. A file that cannot be opened is reported through
   abs_fatal. Returns the value produced by lexicon_new. */
static Lexicon read_lexicon (char *name)
{
   Lexicon result;
   FILE *stream = fopen (name, "r");

   if (!stream) abs_fatal ("Can't open lexicon file %s for reading.", name);
   result = lexicon_new (stream);
   fclose (stream);
   return result;
}

/* File-wide state shared between the tree walk callbacks and main. */
unsigned long wordc = 0;  /* word count */
unsigned int wordl = 0;   /* max word length */
unsigned char **wordv;    /* list of words */
unsigned long ed[ED];     /* ed[d]: number of word pairs counted at edit distance d */
#if MONOLITIC_LIST
unsigned long *worde;     /* list of edit distances */
#else
unsigned long **worde;    /* list of edit distances, one row per word: worde[x][y] with y < x */
#endif

/* s(x) = x*(x - 1)/2, the number of unordered pairs among x items.
   In the monolithic layout it is also the offset of row x.
   Note: the argument is evaluated twice. */
#define s(x) ((x)*((x) - 1)/2)

/* Tree walk callback for the counting pass: every call adds one to the
   global word count. Both arguments are ignored. Always returns 1. */
static int
count_node (int mode, unsigned char *p)
{
   (void) mode;
   (void) p;

   wordc += 1;
   return 1;
}

/* Tree walk callback for the copying pass: append a copy of the word `p`
   to wordv and keep wordl equal to the length of the longest word seen.
   `mode` is ignored. Always returns 1.
   The caller must have sized wordv for all words that will be visited. */
static int
copy_node (int mode, unsigned char *p)
{
   size_t length;

   (void) mode;

   /* strlen takes a plain char pointer and yields a size_t; keep the
      length in that type so it is neither truncated nor sign-converted
      before the comparison with the unsigned wordl. */
   length = strlen ((const char *) p);
   if (wordl < length) wordl = (unsigned int) length;
   wordv[wordc++] = abs_new_string(p, "copy_node");
   return 1;
}

#if MONOLITIC_LIST
/* Allocate the flat edit distance list for `count` words: s(count)
   entries of type unsigned long, matching the declaration of worde.
   Returns NULL when the edit distance list is switched off. */
static unsigned long *
new_edit_distance_table (unsigned long count)
{
   if (!with_edit_distance_list) return NULL;
   return (unsigned long *) abs_calloc (sizeof (unsigned long), s(count), "new_edit_distance_table");
}
#else
/* Allocate the edit distance list for `count` words as an array of rows.
   Row i (1 <= i < count) gets i entries, so worde[i][j] exists for j < i;
   no row is allocated for index 0.
   Returns NULL when the edit distance list is switched off. */
static unsigned long **
new_edit_distance_table (unsigned long count)
{
   unsigned long row;
   unsigned long **list;

   if (!with_edit_distance_list) return NULL;

   list = (unsigned long **) abs_calloc (sizeof (unsigned long *), count, "new_edit_distance_table");
   for (row=1; row<count; row++)
   {
      list[row] = (unsigned long *) abs_calloc (sizeof (unsigned long), row, "main");
   }
   return (list);
}
#endif

#if MONOLITIC_LIST
/* Free the flat edit distance list; does nothing when the list is
   switched off. `count` is not used in this layout. */
static void
release_edit_distance_table (unsigned long *list, int count)
{
   if (with_edit_distance_list) abs_free (list, "release_edit_distance_table");
}
#else
/* Free rows 1 .. count-1 of the edit distance list and then the row
   array itself; does nothing when the list is switched off. */
static void
release_edit_distance_table (unsigned long **list, unsigned long count)
{
   unsigned long row = 1;

   if (!with_edit_distance_list) return;

   while (row < count)
   {
      abs_free (list[row], "release_edit_distance_table");
      row++;
   }
   abs_free (list, "release_edit_distance_table");
}
#endif

/* Store edit distance `e` for the word pair (x, y) in worde.
   Only pairs with x > y have a slot: in the row layout row x holds x
   entries and row 0 is never allocated, in the flat layout the slot is
   s(x) + y. Any other pair, a missing list, or a switched-off list makes
   this a no-op. */
static inline void
set_edit_distance (unsigned long x, unsigned long y, int e)
{
   if (!with_edit_distance_list) return;
   if ((worde == NULL) || (x <= y)) return;
#if MONOLITIC_LIST
   worde[s(x) + y] = e;
#else
   worde[x][y] = e;
#endif
}

/* Return the stored edit distance for the word pair (x, y), or -1 when
   there is nothing to look up: the list is switched off or missing, or
   the pair has no slot (only pairs with x > y are stored). */
static inline int
get_edit_distance (unsigned long x, unsigned long y)
{
   if (!with_edit_distance_list) return -1;
   if ((worde == NULL) || (x <= y)) return -1;
#if MONOLITIC_LIST
   return (int) worde[s(x) + y];
#else
   return (int) worde[x][y];
#endif
}

/* Flat index of cell (i, j, d) in a table laid out as [i][j][d] with
   strides taken from the global wordl and from ED. The result stays
   inside a wordl*wordl*ED table only while i < wordl, j < wordl and d < ED. */
#define IDX(i,j,d) (((i)*wordl + (j))*ED + (d))

/* Dump the counters in `lld`: a timestamp line, then for every pair of
   indices i, j in 1 .. wordl-1 one line "i:j" followed by the entries
   for d = 1 .. ED-1. */
static void
display_c(unsigned long *lld)
{
   time_t now;
   int row, col, dist;

   DB_LEX(printf ("%d:%d:%d\n", wordl, wordl, ED);)
   time (&now);
   printf ("\n%s", ctime (&now));

   row = 1;
   while (row < wordl)
   {
      col = 1;
      while (col < wordl)
      {
         printf ("%d:%d", row, col);
         dist = 1;
         while (dist < ED)
         {
            printf (":%lu", lld[IDX(row,col,dist)]);
            dist++;
         }
         printf ("\n");
         col++;
      }
      row++;
   }
}

/* Number of word pair comparisons between two progress reports in
   verbose mode. */
#define INTERVAL      10000000

/* Print a progress line that overwrites the current one (leading "\r").
   wordc: total number of words (this parameter hides the global of the
          same name)
   i:     index of the word currently being compared
   The percentage is i*(i - 1) relative to wordc*(wordc - 1), i.e. the
   share of word pairs handled so far. With fewer than two words there
   are no pairs at all, which is reported as 100%. */
static inline void
sign_of_life (unsigned long wordc, unsigned long i)
{
   double percentage = 100.0;

   if (wordc > 1)
   {
      /* mind the overflow! Multiply and divide in turns, in floating
         point, instead of forming the integer products first. */
      percentage *= i;
      percentage /= wordc;
      percentage *= i - 1;
      percentage /= wordc - 1;
   }
   /* both counters are unsigned long, so they are printed with %lu */
   abs_printf ("\rdone %.2f%%, (%5lu of %5lu words)", percentage, i, wordc);
}

/* States of `pc` in main, which packs two edit distances into one
   output byte. */
#define pNONE   0   /* no half-filled byte pending */
#define pFIRST  1   /* high nibble of the byte is filled, low nibble still to come */
#define pSECOND 2   /* not referenced in the code shown here */

/* Program entry point.
   1. Process the command line and read the lexicon.
   2. Walk the lexicon trie twice: once to count the words, once to copy
      them into wordv (which also determines wordl).
   3. Compute the limited edit distance for every word pair (i, j) with
      j < i and, depending on the options:
        with_l  print the word list and every pair with its distance,
        with_d  count the pairs per distance, per word and in total,
        with_c  count the pairs per (length, length, distance),
        with_p  write the distances, two per byte, to "<lexicon>.p",
        with_edit_distance_list  store the distances in worde.
   4. Print the summaries and release all memory.
   Returns 0. */
int main (int argc, char *argv[])
{
   Lexicon lexicon;
   Trie trie;
   int huge;
   int interval_count = 0;
   int d;
   int pc = pNONE;
   unsigned char c = 0;
   unsigned long i, j;
   int li, lj;
   EDE *h;
   unsigned long edj[ED];
   unsigned long *lld=NULL;
   FILE *p = NULL;
   char *pname = NULL;

   process_command_line(argc, argv);
   be_verbose ();
   lexicon = read_lexicon (get_lexicon_file_name());
   trie = lxcn_get_lexicon_trie (lexicon);
 
   /* NOTE(review): the counting pass runs with modeEMPTY and the copying
      pass with modeLEAF. wordv and worde are sized by the first pass, so
      confirm that both passes visit the same number of words. */
   walk_tree (modeEMPTY, trie, 0, "", count_node);
   /* huge: wordc*wordc would not fit, so s(wordc) cannot be printed.
      An empty lexicon must not reach the division. */
   huge = (wordc != 0) && (LONG_MAX/wordc < wordc);
   wordv = (unsigned char **) abs_calloc (sizeof (unsigned char *), wordc, "main");
   worde = new_edit_distance_table (wordc);
   wordc = 0;
   walk_tree (modeLEAF, trie, 0, "", copy_node);

   lexicon_free (lexicon);

   if (with_l) 
   {
      printf ("[wordlist]\n");
      for (i=0; i<wordc; i++) printf("%lu : %s\n", i, wordv[i]);
   }   
   if (with_c) 
   {
      /* compute the table size once, in unsigned long */
      unsigned long cells = (unsigned long) wordl * wordl * ED;

      lld = (unsigned long *) abs_calloc (sizeof (unsigned long), cells, "main");
      for (i=0; i<cells; i++) lld[i] = 0;
   }
   if (with_p)
   {
      char *name = get_lexicon_file_name();
      /* room for the name, ".p" and the terminating NUL */
      pname = abs_malloc(strlen(name) + 1 + 2, "lexicon_file_name");
      sprintf (pname, "%s.p", name); 
      p = fopen (pname, "wb");
      /* Only write the header when the file is really open: should
         abs_error return, p stays NULL and all later writes are skipped. */
      if (p == NULL) abs_error ("Can't open output file %s, aborting.", pname);
      else fwrite (pname, sizeof (char), strlen(pname)+1, p);
      /* memory from abs_malloc is released with abs_free, like every
         other allocation in this file */
      abs_free (pname, "main");
      pname = NULL;
   }

   h = NULL;
   if (verbose && wordc > INTERVAL) 
   {
      interval_count = 0;
      abs_printf("\n");
   }
   if (with_d) 
   {
      for (i=0; i<ED; i++) ed[i] = 0;
      printf ("# lexicon: %s\n", get_lexicon_file_name());
      printf ("# format:\n# ");
      for (j=1; j<ED; j++) printf ("ed %lu : ", j);
      printf ("n : length : word\n");
   }
   if (with_l) 
   {
      printf ("\n[l X l]\n");
      printf ("# i1 : l1\n");
      printf ("# (i2 : l2 : d){i1}\n");
   }
   if (p != NULL) fwrite (&wordc, sizeof (unsigned long), 1, p);

   /* the actual loops */
   for (i=0; i<wordc; i++)
   {
      if (with_d) for (j=0; j<ED; j++) edj[j] = 0;
      li = (int) strlen ((const char *) wordv[i]);
      if (with_l) printf ("%lu:%d\n", i, li);
      for (j=0; j<i; j++)
      {
         interval_count++;
         if (verbose && (interval_count >= INTERVAL))
         {
            interval_count = 0;
            sign_of_life (wordc, i);
            if (with_c) display_c(lld);
         }
         lj = (int) strlen ((const char *) wordv[j]);

         h = edit_distance_renew_environment (h, wordv[i], wordv[j]); 
         d = edit_distance_get_limited_distance (h, max_edit_distance);
         if ((with_d) && (0 <= d) && (d < ED)) 
         {
            ed[d]++;
            edj[d]++;
         }
         if (with_l) printf ("%lu:%d:%d\n", j, lj, d);
         /* The table has wordl*wordl*ED cells, so a length equal to wordl
            (the longest word) would index past its end. Such cells are
            never printed by display_c, so they are skipped here. */
         if ((with_c) && (d > 0) && (d < ED)
             && ((unsigned int) li < wordl) && ((unsigned int) lj < wordl))
         {
            lld[IDX(li,lj,d)]++;
         }
         if (p != NULL) 
         {  
            /* Keep the low four bits of d. Masking the unsigned value is
               equal to d % 16 for d >= 0 and, unlike the remainder, stays
               inside one nibble if d is negative. */
            unsigned char nibble = (unsigned char) ((unsigned int) d & 0x0F);

            if (pc == pNONE) 
            { 
               c = (unsigned char) (nibble << 4);
               pc = pFIRST;
            }
            else 
            {
               c |= nibble;
               fwrite (&c, sizeof (unsigned char), 1, p);
               pc = pNONE;
            }
         }
         set_edit_distance (i, j, d);
      }
      if (with_d) 
      {
         for (j=1; j<ED; j++) printf ("%lu : ", edj[j]);
         printf ("%lu : %d : %s\n", i, li, wordv[i]);
      }
   }
   edit_distance_release_environment (&h, "main");

   if (verbose) 
   {
      abs_printf ("\n");
      for (i=0; i<wordc; i++) abs_printf("word: '%s'\n", wordv[i]);
   }
   if (with_d)
   {
      printf ("\nfor lexicon: %s\n", get_lexicon_file_name());
      if (huge) printf ("found %lu words. (and %lu*(%lu - 1)/2 edit distances)\n", wordc, wordc, wordc);
      else printf ("found %lu words. (and %lu edit distances)\n", wordc, s(wordc));
      printf ("max word lenght %u.\n", wordl);
      j = (wordl < max_edit_distance) ? wordl : max_edit_distance;
      if (j > ED) j = ED;
      /* j may be 0; do not let j-1 wrap around */
      printf ("edit distance count for the first %lu distances\n", (j > 0) ? j-1 : 0UL);
      for (i=1; i<j; i++)
      {
         /* share of all wordc*(wordc - 1)/2 pairs; there are no pairs
            with fewer than two words */
         double share = (wordc > 1) ? (double) ed[i]*200.0/wordc/(wordc - 1) : 0.0;

         printf ("edit distance %lu, found %lu hits. (%.2f%%)\n", i, ed[i], share);
      }
   }
   if (with_c) display_c(lld);
   /* flush a byte that holds only its first nibble */
   if ((p != NULL) && (pc != pNONE)) fwrite (&c, sizeof (unsigned char), 1, p);

   for (i=0; i<wordc; i++) abs_free(wordv[i], "main");
   release_edit_distance_table (worde, wordc);
   abs_free (wordv, "main");
   abs_free (lld, "main");
   if (p != NULL) fclose (p);
   return 0;
}
