%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%  $RCSfile: tokenization.pl,i $
%% $Revision: 1.0 $
%%     $Date: 2011/05/21 $
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Defines tokenization to be used for this grammar:
%%
%% Basic idea: Translate a string into words, taking any number 
%%    of spaces as delimiting words, e.g.
%%    "ab"     -> [ab]
%%    "ab cd"  -> [ab,cd]
%%    "ab  cd" -> [ab,cd]
%%
%%    Note that this is equivalent to
%%    [97,98]              -> [ab]
%%    [97,98,32,99,100]    -> [ab,cd]
%%    [97,98,32,32,99,100] -> [ab,cd]
%%
%% Specific cases:
%% - decapitalizes all capital letters
%% - removes the punctuation symbols ; , ( ) - as well as sentence
%%   internal . ! ? (the latter three also end the current word)
%% - a ' ends a word and starts the next word (even if the last symbol)
%%   e.g. "John's" -> [john,'\'s']
%%        "Johns'" -> [johns,'\'']
%% - treats sentence final . ! and ? as referring to the start symbols
%%   defined by decl_symbol/1, imp_symbol/1 and que_symbol/1;
%%   if none of these ends the sentence, root_symbol/1 is used
%%
%%   Author: Detmar Meurers 
%%           with decapitalization rules for Polish by Adam Przepiorkowski
%%           and idea to use sentence final punctuation by Stefan Mueller
%%


% tokenize_sentence_string(+String,-Tokenlist,-Desc)
% Entry point of the tokenizer: passes its three arguments unchanged to
% tokenize_sentence_string_act/3, which does the actual work.
tokenize_sentence_string(Codes,Tokens,StartDesc) :-
   tokenize_sentence_string_act(Codes,Tokens,StartDesc).

% tokenize_sentence_string_act(+String,-Tokenlist,-StartSymbolDesc)
%
% String is a list of character codes; while it is being consumed it may
% also contain the marker atoms quote, fullstop, exclamation and question
% that tokenize_one_word/3 puts in place of ' . ! and ?.
% Tokenlist is the list of word atoms found in String.
% StartSymbolDesc is chosen from what is left when the words are used up:
%   []            -> root_symbol/1
%   [fullstop]    -> decl_symbol/1
%   [exclamation] -> imp_symbol/1
%   [question]    -> que_symbol/1
% If the respective predicate is not defined, StartSymbolDesc is bot.
%
% The marker atoms tested here must be the ones produced by
% sentence_final_punct/2 (the imperative case used to test for `exclam',
% which is never produced, so a final ! was treated like no punctuation).
%
% In the terminating clauses Tokenlist is unified with [] only after the
% cut.  With [] in the clause head, a call whose Tokenlist was already
% bound to a non-matching list skipped these clauses and then recursed
% forever on the empty string in the last clause.
tokenize_sentence_string_act([],Tokens,Desc) :-
   !,
   Tokens = [],
   (  current_predicate(root_symbol/1)
   -> root_symbol(Desc)
   ;  Desc = bot
   ).
tokenize_sentence_string_act([LastChar],Tokens,Desc) :-
   LastChar == fullstop,                  % sentence ended in .
   !,
   Tokens = [],
   (  current_predicate(decl_symbol/1)
   -> decl_symbol(Desc)
   ;  Desc = bot
   ).
tokenize_sentence_string_act([LastChar],Tokens,Desc) :-
   LastChar == exclamation,               % sentence ended in !
   !,
   Tokens = [],
   (  current_predicate(imp_symbol/1)
   -> imp_symbol(Desc)
   ;  Desc = bot
   ).
tokenize_sentence_string_act([LastChar],Tokens,Desc) :-
   LastChar == question,                  % sentence ended in ?
   !,
   Tokens = [],
   (  current_predicate(que_symbol/1)
   -> que_symbol(Desc)
   ;  Desc = bot
   ).
% Otherwise: split off the next word (if any) and go on with the rest.
tokenize_sentence_string_act(String,AtomList,StartSymb) :-
   tokenize_word_string(String,AtomList0,AtomList,RestString),
   tokenize_sentence_string_act(RestString,AtomList0,StartSymb).

% tokenize_word_string(+String,?AtomList0,?AtomList,-RestString)
% Reads one word from the front of String via tokenize_one_word/3.
% A non-empty word is turned into an atom and put in front of AtomList0
% to give AtomList; for an empty word AtomList is just AtomList0.
% RestString is whatever tokenize_one_word/3 left unread.
tokenize_word_string(Codes,Tail,Tokens,Remaining) :-
   tokenize_one_word(Codes,WordCodes,Remaining),
   (  WordCodes \== []
   -> atom_codes(Word,WordCodes),
      Tokens = [Word|Tail]
   ;  Tokens = Tail
   ).

% tokenize_one_word(+String,-Word,-Rest)
% Word is the list of output codes of the first word of String,
% Rest is the part of String still to be processed.
tokenize_one_word([],[],[]).                  % nothing left to read
tokenize_one_word([Code|Codes],Word,Rest) :-
   (  Code == 32                              % space: the word is complete,
   -> Word = [],                              %   skip any further spaces
      eliminate_spaces(Codes,Rest)
   ;  Code == 39                              % ': the word is complete, the
   -> Word = [],                              %   marker quote opens the next one
      Rest = [quote|Codes]
   ;  sentence_final_punct(Code,Marker)       % . ! ?: the word is complete,
   -> Word = [],                              %   leave the marker atom in Rest
      Rest = [Marker|Codes]
   ;  tokenize_one_char(Code,WordTail,Word),  % anything else: map this one
      tokenize_one_word(Codes,WordTail,Rest)  %   element, stay in the word
   ).

% sentence_final_punct(?Code,?Marker)
% Relates the character code of . ! and ? to the marker atom
% fullstop, exclamation and question, respectively.
% The cuts commit to the first two pairs once they have matched.
sentence_final_punct(Code,fullstop) :-
   atom_codes('.',[Code]),
   !.
sentence_final_punct(Code,exclamation) :-
   atom_codes('!',[Code]),
   !.
sentence_final_punct(Code,question) :-
   atom_codes('?',[Code]).

% tokenize_one_char(+Char,?Tail,?Word)
% Word is Tail with the output for Char (if there is any) in front of it.
tokenize_one_char(quote,Tail,[39|Tail]) :-   % the marker quote becomes a real '
   !.
% Nothing is output for the markers fullstop, exclamation and question
% nor for the codes of ( ) - ; and ,
% (when called from tokenize_one_word/3, the codes of . ! ? never get here:
%  they have been replaced by their markers before)
tokenize_one_char(Code,Tail,Tail) :-
   (  sentence_final_punct(_,Code)
   ;  atom_codes('(',[Code])
   ;  atom_codes(')',[Code])
   ;  atom_codes('-',[Code])
   ;  atom_codes(';',[Code])
   ;  atom_codes(',',[Code])
   ),
   !.
tokenize_one_char(Code,Tail,[Small|Tail]) :- % capital letter: output lower case
   decapitalize_char(Code,Small),
   !.
tokenize_one_char(Code,Tail,[Code|Tail]).    % everything else is kept as it is


% eliminate_spaces(+String,-Rest)
% Rest is String without its leading spaces (code 32).
eliminate_spaces([],[]).
eliminate_spaces([Code|Codes],Trimmed) :-
   (  Code \== 32
   -> Trimmed = [Code|Codes]             % first non-space: stop here
   ;  eliminate_spaces(Codes,Trimmed)    % drop the space and go on
   ).

% ------------------------------------------------------------------------
% Converting to lower caps: not the fastest code, but transparent

% decapitalize_char(+Code,-LowerCode)
% LowerCode is the lower case counterpart of the capital letter Code.
% Fails if Code is neither in the range A..Z nor known to decap_special/2.
decapitalize_char(Code,LowerCode) :-
   atom_codes('AZa',[CapA,CapZ,SmallA]),   % codes of A, Z and a
   (  CapA =< Code, Code =< CapZ
   -> LowerCode is Code + (SmallA - CapA)  % A..Z: shift by the distance a - A
   ;  decap_special(Code,LowerCode)        % letters of other character sets
   ).

% decap_special(?Upper,?Lower)
% Table of capital/lower case letter pairs outside A..Z, given as
% character codes.  Fails for any code that is not listed as Upper.
%
% All letters are written as quoted atoms: the lexical class of an
% unquoted non-ASCII character (letter, symbol char, or not allowed in an
% atom at all) depends on the Prolog reader and the encoding it assumes,
% whereas a quoted atom is read the same way everywhere.

% Q -> ru in Saga western dialect
% NOTE(review): the clause below yields the code of r only, not "ru".
% It is also not reached via decapitalize_char/2, which converts A..Z
% itself before decap_special/2 is consulted -- confirm what is intended.
decap_special(X,Y) :-
   atom_codes('Q',[X]),!,
   atom_codes('r',[Y]).

% German letters
decap_special(X,Y) :-           % Ä -> ä
   atom_codes('Ä',[X]),!,
   atom_codes('ä',[Y]).
decap_special(X,Y) :-           % Ü -> ü
   atom_codes('Ü',[X]),!,
   atom_codes('ü',[Y]).
decap_special(X,Y) :-           % Ö -> ö
   atom_codes('Ö',[X]),!,
   atom_codes('ö',[Y]).
% Polish letters added by Adam P.
% NOTE(review): these pairs look like ISO-8859-2 letters shown in Latin-1
% (e.g. 0xA1/0xB1 is A/a with ogonek in ISO-8859-2) -- confirm the
% encoding this file and the input strings are meant to be read in.
decap_special(X,Y) :-           % ¡ -> ±
   atom_codes('¡',[X]),!,
   atom_codes('±',[Y]).
decap_special(X,Y) :-           % Ê -> ê
   atom_codes('Ê',[X]),!,
   atom_codes('ê',[Y]).
decap_special(X,Y) :-           % Æ -> æ
   atom_codes('Æ',[X]),!,
   atom_codes('æ',[Y]).
decap_special(X,Y) :-           % Ñ -> ñ
   atom_codes('Ñ',[X]),!,
   atom_codes('ñ',[Y]).
decap_special(X,Y) :-           % Ó -> ó
   atom_codes('Ó',[X]),!,
   atom_codes('ó',[Y]).
decap_special(X,Y) :-           % ¦ -> ¶
   atom_codes('¦',[X]),!,
   atom_codes('¶',[Y]).
decap_special(X,Y) :-           % ¬ -> ¼
   atom_codes('¬',[X]),!,
   atom_codes('¼',[Y]).
decap_special(X,Y) :-           % ¯ -> ¿
   atom_codes('¯',[X]),!,
   atom_codes('¿',[Y]).
decap_special(X,Y) :-           % £ -> ³
   atom_codes('£',[X]),!,
   atom_codes('³',[Y]).