/*
   File: ebase_input.h
   Defines some basic input stuff

   Copyright (C) 2011 Marc Seutter

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: ebase_input.h,v 1.13 2013/03/13 10:06:43 marcs Exp $"
*/
#ifndef IncEbaseInput
#define IncEbaseInput

#include "ebase_lexicon.h"

/*
   The lexicon generator will put lexemes with different markers in
   different tries. Such a marker will no longer be present in the lexemes. 
   The markers use the following bits.

   bit 0: Prefix
   bit 1: Suffix
   bit 2: Multi Word
   bit 3: Literal Word

   Note that an infix is marked by OR-ing the Prefix and Suffix bits.
   Furthermore we introduce a special character coded as 0xAD (Extended
   ASCII Soft Hyphen); a lone 0xAD byte is not a valid UTF-8 sequence.
   It indicates matching with '-', '-' + white space, or absence of '-'.
   For example, the lexeme well-known (written with this soft hyphen)
   should match wellknown, well-known or well-<WS>known.
*/
/* Soft hyphen marker byte. The constant has type int (value 173), so compare
   it against bytes read as unsigned char rather than a possibly signed char. */
#define SoftHyphenChar      0xAD

/* Lexeme marker bits, one bit per marker (bits 0 to 3) */
#define LexemePrefixBit     0x01
#define LexemeSuffixBit     0x02
#define LexemeMultiWordBit  0x04
#define LexemeLiteralBit    0x08

/* An infix carries both the prefix and the suffix marker */
#define LexemeInfix         (LexemePrefixBit | LexemeSuffixBit)

/*
   Input modes.
   The values are consecutive integers rather than bit flags.
   NOTE(review): judging by the names, the mode selects the unit in which
   input is consumed (a line, a paragraph or a whole document); confirm
   against the code that reads these values.
*/
#define LineInputMode 0
#define ParagraphInputMode 1
#define DocumentInputMode 2

/*
   Translations.
   The values are consecutive integers rather than bit flags.
   NOTE(review): judging by the names, these select whether character
   translations are disabled, use a built-in default, or are loaded from a
   file; confirm against the code that reads these values.
*/
#define TranslationsOff 0
#define TranslationsDefault 1
#define TranslationsByFile 2

/* Regexp NFA kinds; every kind occupies its own bit so kinds can be OR-ed */
#define RegexpMatch  0x1
#define RegexpSkip   0x2

/* Mask selecting both the match and the skip kind */
#define RegexpAll    (RegexpMatch | RegexpSkip)

/*
   Transition Kinds, so that compiler and runtime system talk about the same.
   A lookahead set consists of a minimum state phase in the trellis to test
   the lookahead set plus a sorted list of pairs of transition class and number

   Value layout:
   - TransError is 0, i.e. no bit set.
   - TransTerminal (0x01), TransLexNont (0x02) and TransWord (0x03) share the
     two lowest bits; TransWord equals TransTerminal | TransLexNont, so these
     three cannot be told apart with a single-bit test.
   - TransMatch (0x04), TransSkip (0x08) and TransOther (0x10) are one bit each.
   - TransAny (0x1F) has all five low bits set, covering every kind above;
     it does not include TransWhiteSpace (0x20) or TransEndOfText (0x40).
   NOTE(review): whether the low two bits are meant as an enumerated field or
   as combinable flags is not visible from this header; confirm with the users.
*/
#define TransError 0x00
#define TransTerminal 0x01
#define TransLexNont 0x02
#define TransWord 0x03
#define TransMatch 0x04
#define TransSkip 0x08
#define TransOther 0x10
#define TransAny 0x1F
#define TransWhiteSpace 0x20
#define TransEndOfText 0x40

/*
   Define a penalty as a 32 bit integer (to be future proof).
   The int32 type is not defined in this file; it presumably comes in through
   ebase_lexicon.h or one of its includes (TODO confirm).
*/
typedef int32 Penalty;

/* Define a shorthand for unsigned chars */
typedef unsigned char uchar;

/*
   Input scanning primitives.
   The implementations are not part of this header, so the notes below are
   inferred from names and signatures only; NOTE(review): confirm each of them
   against the implementation before relying on it.

   - Functions taking 'char **input' receive the address of the caller's
     input pointer, which allows them to move it; presumably they advance
     *input past the recognized item when they succeed.
   - Functions taking 'char *input' (the ebs_ahead_* and ebs_past_* tests and
     ebs_skip_one_char) get the pointer by value, so the caller's pointer
     itself is not changed by the call; ebs_skip_one_char returns a char *,
     presumably the position after the skipped character.
   - The int results are presumably boolean (non-zero on success).
*/

/* Presumably sets up the given lexicon for use by the routines below */
void ebs_prepare_lexicon_input (Lexicon lex);

/* Presumably recognizes one UTF-8 character, yielding its value in *ret_val */
int ebs_is_utf8_char (char **input, int *ret_val);
char *ebs_skip_one_char (Lexicon lex, char *input);

/* Separator tests */
int ebs_is_separator (Lexicon lex, char **input);
int ebs_ahead_separator (Lexicon lex, char *input);
int ebs_past_separator (Lexicon lex, char *input);

/* White space tests (single item; see ebs_is_white_spaces for the plural) */
int ebs_is_white_space (Lexicon lex, char **input);
int ebs_ahead_white_space (Lexicon lex, char *input);

/*
   Translation test with four output parameters (rch, rvalue, rstr, rpenalty);
   presumably these receive the translated character, its value, a replacement
   string and the penalty attached to the translation.
*/
int ebs_is_translation (Lexicon lex, char **input,
                        char *rch, int *rvalue, char **rstr, Penalty *rpenalty);
int ebs_is_white_spaces (Lexicon lex, char **input);

/*
   Presumably updates *line and *col to account for the text between 'from'
   and 'to'.
*/
void ebs_update_position (Lexicon lex, char *from, char *to, int *line, int *col);

#endif /* IncEbaseInput */
