/*
   File: lxcn_nfa.c
   Defines the nondeterministic automaton used for fuzzy matching

   Copyright 2009 Radboud University of Nijmegen
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id$"
*/

/*
 * Note: these routines perform what I believe is called the "bitap" algorithm;
 * it seems efficient, and makes it easy to support (restricted) transpositions, but
 * the drawback is that it only works with a standard edit distance
 *
 * it will be replaced by (actually simpler) code which has the same interface
 * implementing a general edit distance, which has worst case performance similar
 * to the dynamic programming algorithm
 *
 * this file predates altchars.dat; it substitutes "alternate" characters
 * at no cost (since that is easy to do)
 * 
 * Also, as a slight modification, we disallow replacing spaces with 
 * ordinary characters.
 *
 */

/* global includes */
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include <math.h>

/* Libabase includes */
#include <abase_porting.h>
#include <abase_error.h>
#include <abase_memalloc.h>

/* local includes */
#include "lxcn_nfa.h"
#include "lxcn_input.h"

/* bits(n): a 64-bit mask with the n lowest bits set, for 0 <= n <= 64.
   n == 64 (a pattern of the maximum length 63 needs 64 state bits) is
   handled explicitly, because shifting a 64-bit value by its full width is
   undefined behaviour in C.  The "& 63" only keeps the unselected branch
   well-defined.  Note that n is evaluated twice. */
#define bits(n) ((n) >= 64 ? ~0ULL : ((1ULL << ((n) & 63)) - 1))

/* xp(reg): the register array advanced by MAX_EDITDISTANCE entries; the
   transposition register belonging to row i is addressed as xp(reg)[i]. */
#define xp(reg) ((reg)+MAX_EDITDISTANCE)

/*
 * Build the transition table for `pattern`.
 *
 * After the call, bit i of mask[c] is set when input character c is
 * accepted at pattern position i.  Every pattern character contributes
 * two entries: the character itself (any character for which
 * lxcn_is_white_space() is true is filed under ' ') and the result of
 * lxcn_translate() for it.
 *
 * Returns the pattern length.  Aborts when the pattern is longer than
 * 63 characters, since one 64-bit word holds all pattern positions.
 */
static int nfa_setup_table(char *pattern, nfa_transition_table mask)
{
    int i, len = (int) strlen (pattern); /* MS: change to size_t? */

    if(len > 63)
        abs_abort("lxcn_nfa_setup_table", "input exceeds limit (to do)");

    memset(mask, 0, sizeof(nfa_transition_table));

    for(i=0; i < len; i++) {
        /* The shift must be done in 64 bits: i can be as large as 62,
           and "1 << i" on a plain int is undefined from i == 31 on. */
        u_int64 const position = 1ULL << i;

        /* Add the current pattern character to the mask */
        unsigned uc = pattern[i] & (ALPHABET_SIZE-1);
        if(lxcn_is_white_space(pattern[i])) uc = ' ';
        mask[uc] |= position;

        /* Also add its translation */
        uc = lxcn_translate(pattern[i]) & (ALPHABET_SIZE-1);
        mask[uc] |= position;
    }

    return len;
}

/*
 * Put a register set into the start state for the given maximum edit
 * distance.  Aborts when max_distance exceeds MAX_EDITDISTANCE.
 */
static void nfa_initialize(nfa_state reg, int max_distance)
{
    int row;

    if (max_distance > MAX_EDITDISTANCE) {
        abs_abort("lxcn_nfa_initialize", "maximum edit distance too large");
    }

    /* Row k starts with its k+1 lowest bits set (1, 11, 111, ...).
       The extra bits below the diagonal make the update step simpler. */
    row = 0;
    while (row <= max_distance) {
        reg[row] = (2ULL << row) - 1;
        row++;
    }

#if TRANSPOSITIONS
    /* Transposition registers are kept for rows 1..max_distance */
    for (row = max_distance; row >= 1; row--) {
        xp(reg)[row] = 0;
    }
#endif
}

/* The external interface to the above */

/*
 * Prepare an automaton for `pattern` with the given maximum edit distance.
 * Fills the shared data in *fsm (transition table, width = pattern length,
 * height = distance) and puts *init into the start state, linked to *fsm.
 */
void lxcn_nfa_create(char *pattern, int distance, AutomatonData *fsm, AutomatonState *init)
{
    int const pattern_len = nfa_setup_table(pattern, fsm->mask);

    fsm->height = distance;
    fsm->width  = pattern_len;

    nfa_initialize(init->reg, distance);
    init->info = fsm;
}

/*
 * Copy the state of *src into *dest and return dest.
 *
 * Only the rows that are in use (0..height) are copied.  When
 * transpositions are enabled, the transposition registers of rows
 * 1..height are copied as well; lxcn_nfa_feed_char_copy() reads them, so
 * a copy without them would continue from stale data.
 */
AutomatonState *lxcn_nfa_deepcopy(AutomatonState *dest, AutomatonState *src)
{
    int const height = src->info->height;

    /* memcpy must not be handed overlapping objects */
    if(dest == src)
        return dest;

    dest->info = src->info;
    memcpy(dest->reg, src->reg, sizeof(u_int64)*(height+1));
#if TRANSPOSITIONS
    memcpy(xp(dest->reg)+1, xp(src->reg)+1, sizeof(u_int64)*height);
#endif
    return dest;
}

/* Update the NFA state depending on the given character.
   It is safe for fsm and fsmnew to alias.

   fsm     the state to advance (only read, unless it aliases fsmnew)
   c       the input character; it is reduced to a table index with
           "& (ALPHABET_SIZE-1)"
   fsmnew  receives the new state; its info pointer is set to fsm's

   Returns the index of the lowest row that was updated when the last row
   tested below is active, and height+1 otherwise.

   NOTE(review): that test reads reg[dist] through the *source* pointer
   after the update, so it sees the old last row when fsm and fsmnew are
   distinct, but the already updated row when they alias -- confirm that
   this difference is intended. */

int lxcn_nfa_feed_char_copy(AutomatonState *fsm, char c, AutomatonState *fsmnew)
{
    int const dist      = fsm->info->height;
    int const len       = fsm->info->width;
    /* pattern positions at which c is accepted */
    u_int64  const mask = fsm->info->mask[c & (ALPHABET_SIZE-1)];
    /* the len+1 state bits that are meaningful in each row */
    u_int64  const clip = bits(len+1);
    u_int64* reg        = fsm->reg;

    /* Small hack: disallow replacing spaces */
    /* (repl has a 0 bit at every pattern position filed under ' ') */
    u_int64  const repl = ~fsm->info->mask[' '];

    int i, best;

    fsmnew->info = fsm->info;

    /* Character match & insertion & replacement updates */
    /* Rows are visited from the last one downwards, so reg[i] and reg[i-1]
       are still the old values when row i is written; this is what makes
       aliasing safe.  The loop stops at the first row whose predecessor
       has no active state within clip. */
    for(i=dist; i>0 && reg[i-1]&clip; i--) {
#if TRANSPOSITIONS
	fsmnew->reg[i] = (reg[i] & mask) << 1 | reg[i-1] | (reg[i-1]&repl) << 1 | (xp(reg)[i] & mask) << 2;
        xp(fsmnew->reg)[i] = reg[i-1] & (mask >> 1);
#else
	fsmnew->reg[i] = (reg[i] & mask) << 1 | reg[i-1] | (reg[i-1]&repl) << 1;
#endif
    }
    /* The row at which the loop stopped can only advance by a match */
    fsmnew->reg[i] = (reg[i] & mask) << 1;

    /* Clear all empty registers */
    /* Only needed when writing to a different object; when aliasing, the
       rows below i are left untouched.
       NOTE(review): with TRANSPOSITIONS this also writes xp(...)[0], which
       by the xp() macro is reg[MAX_EDITDISTANCE]; if dist can equal
       MAX_EDITDISTANCE that is the last row itself -- confirm against the
       register layout in the header. */
    if(reg != fsmnew->reg) {
	int j;
#if TRANSPOSITIONS
	for(j=0; j<i; j++) fsmnew->reg[j] = xp(fsmnew->reg)[j] = 0;
#else
	for(j=0; j<i; j++) fsmnew->reg[j] = 0;
#endif
    }

    best = (reg[dist]&clip)? i : dist+1;
    reg = fsmnew->reg;

    /* Update character deletions & examine nfa state */
    /* Each row above i additionally receives the (new) row below it,
       shifted one position onwards. */
    while(++i<=dist) {
	reg[i] |= reg[i-1] << 1; 
    }

    return best;
}

/* Three ways of matching strings using an NFA:
   1) force-feed everything
   2) eat characters until we have had enough
   3) eat as much as possible                   */

/* Feed every character of str to the automaton, then report whether the
   resulting state is accepting (the value of lxcn_nfa_accepts()). */
int lxcn_nfa_feed_string(AutomatonState *fsm, char *str)
{
    char *cursor;

    for (cursor = str; *cursor != '\0'; cursor++) {
        lxcn_nfa_feed_char(fsm, *cursor);
    }
    return lxcn_nfa_accepts(fsm);
}

/* Debugging variant of lxcn_nfa_feed_string(): prints the state before the
   first character and again after each character that is fed. */
int lxcn_nfa_feed_string_dump(AutomatonState *fsm, char *str)
{
    char *cursor = str;

    lxcn_nfa_dump(fsm);
    for (; *cursor != '\0'; cursor++) {
        printf("Fed(%c)\n", *cursor);
        lxcn_nfa_feed_char(fsm, *cursor);
        lxcn_nfa_dump(fsm);
    }
    return lxcn_nfa_accepts(fsm);
}

/*
 * Feed characters of str until the automaton accepts for the first time.
 *
 * Returns a pointer just past the last character consumed, or NULL when
 * the input runs out or the automaton rejects before accepting.  For an
 * empty str the current state decides: str itself when it is accepting,
 * NULL otherwise.  A non-empty str always has at least one character fed
 * before acceptance is tested.
 */
char *lxcn_nfa_match_shortest(AutomatonState *fsm, char *str)
{
    char *cursor = str;

    if (*cursor == '\0') {
        return lxcn_nfa_accepts(fsm) ? cursor : NULL;
    }

    for (;;) {
        if (*cursor == '\0' || lxcn_nfa_rejects(fsm)) {
            return NULL;
        }
        lxcn_nfa_feed_char(fsm, *cursor);
        cursor++;
        if (lxcn_nfa_accepts(fsm)) {
            return cursor;
        }
    }
}

/*
 * Find the shortest match first, then keep consuming characters for as
 * long as the automaton remains in an accepting state.
 *
 * Returns a pointer just past the last character consumed, or NULL when
 * lxcn_nfa_match_shortest() finds no match.  On a successful return *fsm
 * is the (accepting) state reached after the last consumed character.
 *
 * Each further character is tried on a scratch copy and only committed
 * when the resulting state accepts.  The earlier version tested the old,
 * already accepting state after the first of its two alternating steps,
 * so that step always consumed a character and could leave *fsm in a
 * non-accepting state.
 */
char *lxcn_nfa_match_longest(AutomatonState *fsm, char *str)
{
    AutomatonState scratch;
    AutomatonState *candidate = &scratch;

    str = lxcn_nfa_match_shortest(fsm, str);
    if(str == NULL)
        return NULL;

    while(*str) {
        lxcn_nfa_feed_char_copy(fsm, *str, candidate);
        if(!lxcn_nfa_accepts(candidate))
            break;

        /* commit the extended match */
        *fsm = *candidate;
        str++;
    }

    return str;
}

/* Graphically represent the given NFA state (for debugging) */

void lxcn_nfa_dump(AutomatonState *fsm)
{
    int const dist      = fsm->info->height;
    int const len       = fsm->info->width;
    u_int64* const reg  = fsm->reg;
    int x1, x2, y1, y2;

    int i, j;
    for(i=0; i<=dist; i++) {
#if TRANSPOSITIONS
        if(i) {
	    u_int64 r = xp(reg)[i]; 
            printf("> ");
	    for(j=0; j<=len; j++) {
		printf("%c", r&1? '*':'.');
		r >>= 1;
	    }
	    printf("<\n");
        }
#endif
	u_int64 r = reg[i]; 
	printf("| ");
	for(j=0; j<=len; j++) {
	    printf("%c", r&1? '*':'.');
	    r >>= 1;
	}
        printf("|\n");
    }
    lxcn_nfa_get_min_pos(fsm, &x1, &y1);
    lxcn_nfa_get_max_pos(fsm, &x2, &y2);
    printf("[%d,%d..%d,%d]\n", x1,y1,x2,y2);
}

/* Determine whether an NFA has become totally deterministic at row `dist`,
   i.e. whether exactly one state is active in that row and none in the
   row below it.

   Returns the position (bit index) of that single state, or -1 when the
   row holds no active state, holds more than one, or when row dist-1 is
   still active.  An empty row used to pass the power-of-two test and was
   reported as position 0. */

int lxcn_nfa_deterministic_pos(AutomatonState *fsm, int dist)
{
    int const len      = fsm->info->width;
    u_int64 const clip = bits(len+1);
    u_int64 const mask = fsm->reg[dist] & clip;

    /* No active state at all is not "deterministic" */
    if(mask == 0)
        return -1;

    /* More than one bit set: mask is not a power of two */
    if(mask & (mask-1))
        return -1;

    if(dist > 0 && (fsm->reg[dist-1] & clip))
        return -1;

    /* mask is a power of two here; assemble the index of its single bit
       from six range tests, one per bit of the result */
    return ((mask & 0xFFFFFFFF00000000ULL) != 0) << 5
         | ((mask & 0xFFFF0000FFFF0000ULL) != 0) << 4
         | ((mask & 0xFF00FF00FF00FF00ULL) != 0) << 3
         | ((mask & 0xF0F0F0F0F0F0F0F0ULL) != 0) << 2
         | ((mask & 0xCCCCCCCCCCCCCCCCULL) != 0) << 1
         | ((mask & 0xAAAAAAAAAAAAAAAAULL) != 0);
}

/*
 * Index of the highest set bit of num (floor of its base-2 logarithm);
 * yields 0 for num == 0 as well as for num == 1.
 *
 * ilogb() would do the same but seemed unportable, so the index is found
 * by a binary search over the shift widths 32, 16, 8, 4, 2, 1.
 */
static int log_2(u_int64 num)
{
    int result = 0;
    int width;

    for (width = 32; width >= 1; width /= 2) {
        if (num >= (1ULL << width)) {
            num >>= width;
            result |= width;
        }
    }
    return result;
}

/* Returns:
    in x, the maximum amount of input matched (the highest active position
          in the last row)
    in y, the least distance at which this is reached
   When the last row has no active state, x is -1 and y is height+1. */

void lxcn_nfa_get_max_pos(AutomatonState *fsm, int *x, int *y)
{
    int const last_row   = fsm->info->height;
    u_int64 const active = fsm->reg[last_row] & bits(fsm->info->width+1);
    u_int64 probe;
    int position, level;

    if (active == 0) {
        *x = -1;
        *y = last_row + 1;
        return;
    }

    position = log_2(active);
    probe    = 1ULL << position;

    /* Walk towards row 0 for as long as the same position stays active */
    for (level = last_row; level > 0; level--) {
        if ((fsm->reg[level-1] & probe) == 0) {
            break;
        }
    }

    *x = position;
    *y = level;
}

/* Returns:
    in y, the least distance at which this NFA is still active
    in x, the amount of characters matched at this distance
   When no row is active, x is -1 and y is height+1, the same sentinel
   that lxcn_nfa_get_max_pos() uses (this used to be height+2). */

void lxcn_nfa_get_min_pos(AutomatonState *fsm, int *x, int *y)
{
    u_int64 const clip = bits(fsm->info->width+1);
    int const height   = fsm->info->height;
    int i;

    /* find the first active row, i.e. the one with the least distance */
    for(i=0; i <= height; i++) {
        u_int64 const active = fsm->reg[i] & clip;
        if(active) {
            *x = log_2(active);
            *y = i;
            return;
        }
    }

    *x = -1;
    *y = height+1;
}

/* Returns 1 when no row has a bit set above the diagonal that
   nfa_initialize() fills in (row i holds nothing at or above bit i+1),
   and 0 as soon as one row does. */
int lxcn_nfa_is_trivial(AutomatonState *fsm)
{
    int row = 0;

    while (row <= fsm->info->height) {
        /* anything left after dropping bits 0..row lies above the diagonal */
        if ((fsm->reg[row] >> (row + 1)) != 0) {
            return 0;
        }
        row++;
    }
    return 1;
}

/* not used; temporarily included */

/* Order two states of the same automaton.  The rows are compared from
   row 0 upwards as unsigned numbers (restricted to the width+1 state
   bits); the first differing row decides.  Returns -1, 0 or 1.
   Aborts when the two states do not share the same automaton data. */

int lxcn_nfa_compare(AutomatonState *a, AutomatonState *b)
{
    u_int64 mask = bits(a->info->width+1);
    int i;

    /* Function name first, message second, as in the other abs_abort
       calls of this file (the two arguments used to be swapped). */
    if(a->info != b->info)
        abs_abort("lxcn_nfa_compare", "incomparable automata");

#if 1
    for(i=0; i <= a->info->height; i++) {
        u_int64 x = a->reg[i] & mask;
        u_int64 y = b->reg[i] & mask;
        if(x < y) return -1; else
        if(x > y) return  1;
    }
#else
    /* Disabled alternative: compare only the bit at position width */
    mask = 1ULL << a->info->width;
    for(i=0; i <= a->info->height; i++) {
        u_int64 x = a->reg[i] & mask;
        u_int64 y = b->reg[i] & mask;
        if(!x && y) return -1; else
        if(x && !y) return  1;
    }
#endif
    return 0;
}

