// Debugging utility for looking up entries in the trie in the .blf file.
//
// Copyright 2001, KUN.
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Library General Public License for more details.
//
// You should have received a copy of the GNU Library General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

// $Id: findtrie.c,v 1.5 2003/11/26 22:15:13 pspiertz Exp $


#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

enum { MAX_LINE = 1024 };
enum { WORD_SIZE = sizeof(char*) };

/*
//------------------------------------------------------------------------------
// Utilities
//------------------------------------------------------------------------------
*/

void my_abort(char* fmt, ...)
{
  va_list argp;

  va_start(argp, fmt);
  vfprintf(stderr, fmt, argp);
  va_end(argp);
  fputc('\n', stderr);
  exit(1);
}

char* chop(char* p)
{
  p[strlen(p) - 1] = '\0';
  return p; 
}

inline unsigned char* align(unsigned char* p)
{
  unsigned mod = (unsigned)p % WORD_SIZE;
  return mod ? p + WORD_SIZE - mod : p;
}

/*
//------------------------------------------------------------------------------
// Memory management
//------------------------------------------------------------------------------
*/

void* get_mem(unsigned size)
{
  void* mem = malloc(size);
  if (mem == NULL)
    my_abort("Out of memory");
  return mem;
}

void free_mem(void* mem)
{
  free(mem);
}

/*
//------------------------------------------------------------------------------
// Trie routines
//------------------------------------------------------------------------------
*/

enum { TRIE_PATH = 0, TRIE_SZ = 1, TRIE_HEADER_SZ = 2 };

typedef unsigned char*	trie_p;

trie_p new_trie(FILE* in)
{
  unsigned size;
  trie_p trie;

  fread(&size, 4, 1, in);
  fprintf(stderr, "trie size = %d\n", size);

  trie = (trie_p)get_mem(size);
  fread(trie, 1, size, in);
  return trie;
}

void delete_trie(trie_p trie)
{
  free_mem(trie);
}

/*
//------------------------------------------------------------------------------
// Function:
//	int bin_search(unsigned char c, unsigned char* p, unsigned size)
//
// Description:
//	Binary search for c in array p with size > 0.
//
// Return value:
//	Index of c in array, or -1 else.
//------------------------------------------------------------------------------
*/

inline int bin_search(unsigned char c, unsigned char* p, unsigned size)
{
  unsigned low = 0;
  unsigned high = size;
  do
  {
    unsigned mid = (low + high) / 2;
    unsigned char d = p[mid];
    if (c < d)
      high = mid;
    else if (c > d)
      low = mid + 1;
    else
      return mid;
  }
  while (low < high);
  return -1;
}

/*
//------------------------------------------------------------------------------
// Function: int find_word(unsigned char* str, trie_p root)
//
// Description:
//	Try to identify exact match of str in trie root.
//
// Return value:
//	If str is found, then return the info number associated with
//	str, or -1 else.
//------------------------------------------------------------------------------
*/

int find_word(unsigned char* str, trie_p root)
{
  unsigned char c;
  unsigned offset = 0;
  do
  {
    int index;
    unsigned char* trie = root + offset;
    unsigned path = trie[TRIE_PATH];
    unsigned size = trie[TRIE_SZ];
    trie += TRIE_HEADER_SZ;
    while (path--)
      if (*trie++ != *str++)
        return -1;
    c = *str++;
    index = bin_search(c, trie, size);
    if (index < 0)
      return -1;
    offset = *(unsigned*)align(trie + size + WORD_SIZE * index);
  }
  while (c);
  return (int)offset;
}

/*
//------------------------------------------------------------------------------
// Function:
//	int find_prefix(unsigned char* str, trie_p root, unsigned char** str2)
//
// Description:
//	Search longest nonempty prefix of string str in trie root.
//	The string pointed to by str2 is made to point to the first
//	character after the longest (possibly empty) prefix.
//
// Return value:
//	If a prefix of str is found, then return the info number
//	associated with the prefix, or -1 else.
//
// Note:
//	The algorithm starts with -1 as info, and str assigned to str2,
//	and updates info and str2 each time a prefix is found. When
//	no longer prefix can be found, info is returned.
//------------------------------------------------------------------------------
*/

int find_prefix(unsigned char* str, trie_p root, unsigned char** str2)
{
  unsigned char c;
  unsigned offset = 0;
  int info = -1;
  *str2 = str;
  do
  {
    int index;
    unsigned char* trie = root + offset;
    unsigned path = trie[TRIE_PATH];
    unsigned size = trie[TRIE_SZ];
    trie += TRIE_HEADER_SZ;
    while (path--)
      if (*trie++ != *str++)
        return info;
    if (trie[0] == '\0')
    {
      *str2 = str;
      info = *(int*)align(trie + size);
    }
    c = *str++;
    index = bin_search(c, trie, size);
    if (index < 0)
      return info;
    offset = *(unsigned*)align(trie + size + WORD_SIZE * index);
  }
  while (c);
  return info;
}

/*
//------------------------------------------------------------------------------
// Function:
//	int find_next_prefix(unsigned char* str, trie_p root, unsigned char** str2)
//
// Description:
//	Find all nonempty prefixes of string str in trie root.
//	If str is not NULL, the first (and shortest) prefix of str is
//	searched for, and the next prefix otherwise.
//	The string pointed to by str2 is made to point to the first
//	character after each (possibly empty) prefix.
//
// Return value:
//	For each nonempty prefix of str, the info number associated with
//	the prefix is returned. If no prefix can be found, or all prefixes
//	have been found, -1 is returned.
//
// Note:
//	At the first call, str should not be NULL. At subsequent calls,
//	str may be NULL, until -1 is returned.
//	The algorithm starts with -1 as info, and str assigned to str2,
//	and updates info and str2 each time a prefix is found. For each
//	prefix, info is returned.
//------------------------------------------------------------------------------
*/

int find_next_prefix(unsigned char* new_str, trie_p root, unsigned char** str2)
{
  static unsigned char* str = NULL;
  static unsigned offset = 0;

  unsigned char c;
  int info = -1;

  if (new_str)
  {
    *str2 = str = new_str;
    offset = 0;
  }
  else if (str == NULL)
    return -1;

  do
  {
    int index;
    unsigned char* trie = root + offset;
    unsigned path = trie[TRIE_PATH];
    unsigned size = trie[TRIE_SZ];
    trie += TRIE_HEADER_SZ;
    while (path--)
      if (*trie++ != *str++)
        return -1;
    if (trie[0] == '\0')
    {
      *str2 = str;
      info = *(int*)align(trie + size);
    }
    c = *str++;
    index = bin_search(c, trie, size);
    if (index < 0 || c == 0)
    {
      str = NULL;
      return info;
    }
    offset = *(unsigned*)align(trie + size + WORD_SIZE * index);
    if (trie[0] == '\0')
      return info;
  }
  while (c);
  str = NULL;
  return -1;
}

/*
//------------------------------------------------------------------------------
// Function:
//	int find_next_lexeme(enum LexemeType lex_type,
//	                     enum SeparatorType sep_type,
//	                     unsigned char* str,
//	                     trie_p root,
//	                     unsigned char** str2)
//
// Description:
//	Find all nonempty prefixes of string str with lex_type in trie root.
//	If str is not NULL, the first (and shortest) prefix of str is
//	searched for, and the next prefix otherwise.
//	If sep_type is SepTrue, then the prefix must be followed by
//	a character c for which is_separator(c) returns true. Else,
//	if sep_type is SepFalse, then is_separator(c) should return false.
//	A space in a lexeme matches one or more characters c for which
//	is_blank(c) returns true.
//
// Return value:
//	For each nonempty prefix of str, the info number associated with
//	the prefix is returned. If no prefix can be found, or all prefixes
//	have been found, -1 is returned.
//
// Side Effects:
//	The string pointed to by str2 is made to point to the first
//	character after each (possibly empty) prefix.
//
// Note:
//	At the first call, str should not be NULL. At subsequent calls,
//	str may be NULL, until -1 is returned. The function stores a
//	local state, starting with -1 as info, and str assigned to str2,
//	and updates info and str2 each time a prefix is found. For each
//	prefix, info is returned.
//------------------------------------------------------------------------------
*/

inline int is_separator(unsigned char c)
{
  switch (c)
  {
    case ' ':
    case '\t':
    case '\n':
    case '\0':
      return 1;
      break;
    default:
      return 0;
  }
}

inline int is_blank(unsigned char c)
{
  switch (c)
  {
    case ' ':
    case '\t':
    case '\n':
      return 1;
      break;
    default:
      return 0;
  }
}

enum LexemeType		{ Prefix, Infix, Suffix, MultiToken, WordForm };
enum SeparatorType	{ SepTrue, SepFalse, SepDontCare };

const char EmptyMark		= '\0';
const char PrefixMark		= '\1';
const char SuffixMark		= '\2';
const char InfixMark		= '\3';
const char MultiTokenMark	= '\4';

int find_next_lexeme(enum LexemeType lex_type,
                     enum SeparatorType sep_type,
                     const unsigned char* new_str,
                     trie_p root,
                     unsigned char** str2)
{
  static unsigned char* str = NULL;
  static unsigned offset = 0;

  int info = -1;

  unsigned char special_char = 0;	// prevent initialization warning
  unsigned char next_char;

	/*
	// Check whether first call for new string. If we have
	// a new string, maybe match a special character denoting
	// its type. If not, check whether longer prefix exists, by
	// continuing from the position reached by the previous call.
	*/
  if (new_str)
  {
    *str2 = str = (unsigned char*)new_str;
    offset = 0;
    switch (lex_type)
    {
      case Prefix:
        special_char = PrefixMark;
        break;
      case Infix:
        special_char = InfixMark;
        break;
      case Suffix:
        special_char = SuffixMark;
        break;
      case MultiToken:
        special_char = MultiTokenMark;
        break;
      case WordForm:
        special_char = EmptyMark;
        break;
    }
  }
  else if (str == NULL)
    return -1;
  else
    special_char = EmptyMark;

  do
  {
    int index;

	/*
	// Move to next trie, and read header
	*/
    unsigned char* trie = root + offset;
    unsigned path = trie[TRIE_PATH];
    unsigned size = trie[TRIE_SZ];
    trie += TRIE_HEADER_SZ;

  	/*
	// Match prefix path with input string. If we have a
	// special char denoting a special word type, match it first.
	// A space matches blank+.
	*/
    if (path > 0)
    {
      if (special_char != EmptyMark)
      {
        if (*trie++ != special_char)
        {
          str = NULL;
          return -1;
        }
        special_char = EmptyMark;
      }
      while (path--)
      {
        next_char = *trie++;
        if (next_char == ' ')
        {
          if (!is_blank(*str++))
          {
            str = NULL;
            return -1;
          }
          while (is_blank(*str))
            str++;
        }
        else if (*str++ != next_char)
        {
          /* should try lower case match here */
          str = NULL;
          return -1;
        }
      }
    }

	/*
	// Check whether we still need to match a special char.
	// If not, check whether we have a prefix, and get next
	// char from input string.
	*/
    if (special_char != EmptyMark)
    {
      next_char = special_char;
      special_char = EmptyMark;
    }
    else
    {
      if (trie[0] == '\0')
      {
        if ((sep_type == SepTrue  &&  is_separator(*str)) ||
            (sep_type == SepFalse && !is_separator(*str)) ||
            (sep_type == SepDontCare))
        {
          *str2 = str;
          info = *(int*)align(trie + size);
        }
      }
      next_char = *str++;
    }
    if (next_char == 0)
    {
      str = NULL;
      return info;
    }
    else if (is_blank(next_char))
    {
      while (is_blank(*str))
        str++;
      next_char = ' ';
    }

	/*
	// Try to match next character.
	*/
    index = bin_search(next_char, trie, size);
    if (index < 0)
    {
	/* should try lower case match here */
      str = NULL;
      return info;
    }

	/*
	// We have identified the next character; remember position
	// of next trie. If we had identified an info, return it.
	*/
    offset = *(unsigned*)align(trie + size + WORD_SIZE * index);
    if (info != -1)
      return info;
  }
  while (next_char);
  str = NULL;
  return -1;
}

/*
//------------------------------------------------------------------------------
// Function:
//	char* prompt(char* buf)
//
// Description:
//	Show prompt and read line from stdin into buf.
//
// Return value:
//	If line was read, return buf, or NULL else.
//------------------------------------------------------------------------------
*/

char* prompt(char* buf)
{
  fprintf(stderr, "findtrie> ");
  if (fgets(buf, MAX_LINE, stdin) && buf[0] != '\n')
    return buf;
  else
    return NULL;
}

void find_special(enum LexemeType lex_type, enum SeparatorType sep_type,
                  const unsigned char* buf, trie_p trie)
{
  unsigned char* p;
  int info;
  for (info = find_next_lexeme(lex_type, sep_type, buf, trie, &p);
       info >= 0;
       info = find_next_lexeme(lex_type, sep_type, 0, trie, &p))
  {
    unsigned char save = p[0];
    p[0] = '\0';
    printf("Found '%s', info = %d\n", buf, info);
    p[0] = save;
  }
}

void query_trie(trie_p trie)
{
  unsigned char buf[MAX_LINE];

  while (prompt((char*)buf))
  {
    chop((char*)buf);

#if 0
    int info;
    unsigned char* p;

	/*
	// Try to find exact match.
	*/
    info = find_word(buf, trie);
    if (info < 0)
      printf("No match\n");
    else
      printf("Match, info = %d\n", info); 

	/*
	// Try to find longest prefix.
	*/
    info = find_prefix(buf, trie, &p);
    if (info < 0) 
      printf("No prefix\n");
    else
    {
      unsigned char save = p[0];
      p[0] = '\0';
      printf("Longest prefix = '%s', info = %d\n", buf, info); 
      p[0] = save;
    }

	/*
	// Try to find all prefixes.
	*/
    for (info = find_next_prefix(buf, trie, &p);
         info >= 0;
         info = find_next_prefix(0, trie, &p))
    {
      unsigned char save = p[0];
      p[0] = '\0';
      printf("Prefix = '%s', info = %d\n", buf, info); 
      p[0] = save;
    }
#endif
	/*
	// Find special lexemes
	*/
    printf("Type Prefix, SepTrue:\n");
    find_special(Prefix, SepTrue, buf, trie);
    printf("Type Prefix, SepFalse:\n");
    find_special(Prefix, SepFalse, buf, trie);
    printf("Type Prefix, SepDontCare:\n");
    find_special(Prefix, SepDontCare, buf, trie);

    printf("Type Infix, SepTrue:\n");
    find_special(Infix, SepTrue, buf, trie);
    printf("Type Infix, SepFalse:\n");
    find_special(Infix, SepFalse, buf, trie);
    printf("Type Infix, SepDontCare:\n");
    find_special(Infix, SepDontCare, buf, trie);

    printf("Type Suffix, SepTrue:\n");
    find_special(Suffix, SepTrue, buf, trie);
    printf("Type Suffix, SepFalse:\n");
    find_special(Suffix, SepFalse, buf, trie);
    printf("Type Suffix, SepDontCare:\n");
    find_special(Suffix, SepDontCare, buf, trie);

    printf("Type WordForm, SepTrue:\n");
    find_special(WordForm, SepTrue, buf, trie);
    printf("Type WordForm, SepFalse:\n");
    find_special(WordForm, SepFalse, buf, trie);
    printf("Type WordForm, SepDontCare:\n");
    find_special(WordForm, SepDontCare, buf, trie);

    printf("Type MultiToken, SepTrue:\n");
    find_special(MultiToken, SepTrue, buf, trie);
    printf("Type MultiToken, SepFalse:\n");
    find_special(MultiToken, SepFalse, buf, trie);
    printf("Type MultiToken, SepDontCare:\n");
    find_special(MultiToken, SepDontCare, buf, trie);
  }
}

void find_all_prefixes(FILE* in_file, trie_p trie)
{
  unsigned char buf[MAX_LINE];

  while (fgets((char*)buf, MAX_LINE, in_file))
  {
    int info;
    unsigned char* p;

    chop((char*)buf);

	/*
	// Try to find all prefixes.
	*/
    info = find_next_prefix(buf, trie, &p);
    if (info < 0)
      // typecast necessary with gcc 3.0.3:
      // findtrie.c:676: error: invalid conversion from `unsigned char*' to `const char*`
      printf("%us\n", (const char*) buf);
    else
      while (info >= 0)
        info = find_next_prefix(0, trie, &p);
  }
}

int main(int argc, char* argv[])
{
  FILE* trie_file;
  trie_p trie = NULL;

	/*
	// Check arguments
	*/
  if (argc < 2 || argc > 3)
    my_abort("Usage: findtrie <trie-file> [ <in-file> ]");

	/*
	// Read trie
	*/
  trie_file = fopen(argv[1], "rb");
  if (trie_file == NULL)
    my_abort("Error: cannot open file `%s'", argv[1]);
  trie = new_trie(trie_file);

	/*
	// If input file specified, find prefixes, or
	// interactive querying else.
	*/
  if (argc == 2)
    query_trie(trie);
  else if (argc == 3)
  {
    FILE* in_file = fopen(argv[2], "rb");
    if (in_file == NULL)
      my_abort("Error: cannot open file `%s'", argv[2]);
    find_all_prefixes(in_file, trie);
  }

  delete_trie(trie);
  return 0;
}
