Lhogho  0.0.028
 All Data Structures Files Functions Variables Typedefs Macros Pages
parser.h File Reference

Go to the source code of this file.

Tokenization methods

#define TOKENIZE_DATA   0
 tokenize as data More...
 
#define TOKENIZE_COMMANDS   1
 tokenize as commands More...
 
#define TOKENIZE_READWORD   2
 tokenize as for READWORD function More...
 
#define TOKENIZE_READLIST   3
 tokenize as for READLIST function More...
 
#define ENBAR(ch)   (((unsigned char)ch)>127 ? (ch) : enbar[(unsigned char)ch])
 
#define DEBAR(ch)   ((ch)>31 ? (ch) : debar[(unsigned char)ch])
 
char_t enbar [128]
 table for a->|a| conversions More...
 
char_t debar [32]
 table for |a|->a conversions More...
 
void init_parser ()
 initializes parser More...
 
atom_t tokenize (atom_t input, int method)
 tokenizes into a list More...
 
atom_t trim_shell_comment (atom_t word)
 trims shell comment (if any) More...
 
atom_t purify (atom_t word)
 purifies a word More...
 
atom_t build_syntax_tree (atom_t function)
 parses body of user-defined function More...
 

Macro Definition Documentation

#define TOKENIZE_DATA   0

Definition at line 55 of file parser.h.

#define TOKENIZE_COMMANDS   1

Definition at line 56 of file parser.h.

#define TOKENIZE_READWORD   2

Definition at line 57 of file parser.h.

#define TOKENIZE_READLIST   3

Definition at line 58 of file parser.h.

#define ENBAR (   ch)    (((unsigned char)ch)>127 ? (ch) : enbar[(unsigned char)ch])

Definition at line 68 of file parser.h.

#define DEBAR (   ch)    ((ch)>31 ? (ch) : debar[(unsigned char)ch])

Definition at line 69 of file parser.h.

Function Documentation

void init_parser ( )

Initializes tables enbar[] and debar[] which are used to enbar and debar a character.

Definition at line 224 of file parser.c.

225 {
     // Fills the global tables debar[] and enbar[]. Both start out as the
     // identity mapping; then, for every non-placeholder character found at
     // position i of the string s, debar[i] is set to that character and
     // enbar[character] is set to i.
226  int i;
227 
228  // NOTE! if the string of enbarrable characters is
229  // changed, also change dump_word()
     // '_' entries are placeholders: the loop below skips those positions.
230  chars_t s = TEXT("______________()+-*/=<>|?_______");
231  //locked-> x........xx..x..................
232 
233  for( i=0; i<32; i++ ) debar[i]=i; // identity mapping
234  for( i=0; i<128; i++ ) enbar[i]=i; // identity mapping
235 
     // the index of a character inside s becomes its enbarred code
236  for( i=0; *s; i++,s++ )
237  {
238  if( *s==TEXT('_') ) continue; // placeholder: keep the identity entry
239  debar[i] = *s;
240  enbar[(unsigned char)*s] = i;
241  }
242 }
atom_t tokenize ( atom_t  input,
int  method 
)
Parameters
input: word, subword or list to be tokenized
method: method of tokenization (one of the TOKENIZE_* constants)
Returns
tokenized list

Tokenizes a word, a subword or a list into a list. Where possible, the resulting words are created as subwords of the input. Backslashes and bars in words are preserved (i.e. words are not purified). Comments and line continuations are ignored.

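If the method is TOKENIZE_DATA then the input is tokenized as if it contains Logo data. If the method is TOKENIZE_COMMANDS then the input is tokenized as if it contains Logo commands. If the method is TOKENIZE_READWORD then the input is tokenized as expected by the READWORD function. If the method is TOKENIZE_READLIST then the input is tokenized as expected by the READLIST function (like data, except that semicolons get no special treatment).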

If the input is a list then all its elements are tokenized one-by-one.

The return value is the tokenized list. If there is an error, the function returns an error atom whose error code is ERROR_INCOMPLETE_PAIR, whose error position is the (0-based) position in the word, and whose error source is the word itself.

Definition at line 275 of file parser.c.

276 {
277  if( IS_LIST(input) )
278  {
279  if( method==TOKENIZE_DATA &&
281  return USE(input);
282  if( method==TOKENIZE_COMMANDS &&
284  return USE(input);
285  }
286 
287  if( IS_FLOAT(input) )
288  {
289  return USE(input);
290  }
291 
292  //printf("\n");
293  //if(method==TOKENIZE_DATA)
294  // printf("ENTER TOKENIZE_DATA(");
295  //else
296  // printf("ENTER TOKENIZE_COMMANDS(");
297  //dump_atom(input,1);
298  //printf(")\n");
299 
300 
301  #ifdef SAFEMODE
302  assert( IS_WORD(input) || IS_SUBWORD(input) || IS_LIST(input) );
303  #endif
304 
305  // First check whether the input is a list.
306  // If it is then tokenize recursively all its elements.
307  if( IS_LIST(input) )
308  {
309  atom_t result = empty_list;
310  atom_t last = empty_list;
311  atom_t x;
312  atom_t y;
313  for( x=input; IS_NOT_EMPTY(x); x=CDR(x) )
314  {
315  //printf("@@@@@@@@PROCESSING=|"); dump_atom(CAR(x),1); printf("|\n");
316 
317  int submethod = IS_LIST(CAR(x))?TOKENIZE_DATA:method;
318  atom_t element = tokenize( CAR(x), submethod );
319  //printf("@@@@@@@@TOKENIZED_INTO=|"); dump_atom(element,1); printf("|\n");
320 
321  if( IS_ERROR(element) )
322  {
323  DEUSE( result );
324  //DEUSE( last );
325  result = element;
326  break;
327  }
328 
329  int initial_flags = GET_FLAGS( x, FLAG_NEWLINE|FLAG_AFTER_SPACE );
330  int final_flags = GET_FLAGS( x, FLAG_BEFORE_SPACE );
331 
332  //printf(">>>CAR(x) = "); dump_atom(CAR(x),1); printf("\n");
333  //printf(">>>element = "); dump_atom(element,1); printf("\n");
334  if( IS_FLOAT(element) )
335  {
336  append( USE(element), &result, &last );
337  SET_FLAGS( last, initial_flags|final_flags );
338  }
339  else if( IS_LIST(CAR(x)) )
340  {
341  if( IS_EXTENDED(x) )
342  {
343  append_ex( USE(element), &result, &last );
344  DEUSE( POS(last) );
345  POS( last ) = USE( POS(x) );
346  }
347  else
348  append( USE(element), &result, &last );
349  SET_FLAGS( last, initial_flags|final_flags );
350  }
351  else
352  {
353  for( y=element; IS_NOT_EMPTY(y); y=CDR(y) )
354  {
355  //printf("APPEND SUBELEMENT |");
356  //dump_atom(CAR(y),1);
357  //printf("|\n");
358 
359  if( IS_EXTENDED(y) )
360  {
361  append_ex( USE(CAR(y)), &result, &last );
362  DEUSE( POS(last) );
363  POS( last ) = USE( POS(y) );
364  }
365  else
366  append( USE(CAR(y)), &result, &last );
367  SET_FLAGS( last, FLAGS(y) );
368  if( y==element ) SET_FLAGS( last, initial_flags );
369  }
370  #ifdef SAFE_MODE
371  assert( IS_NOT_EMPTY(last) );
372  #endif
373  SET_FLAGS( last, final_flags);
374  }
375 
376  DEUSE( element );
377  }
378 
379  //printf("FINAL RESULT IS |"); dump_atom(result,1); printf("|\n");
380  return result;
381  }
382 
383  // The input is a word or a subword
384  chars_t source = STRING(input);
385  int len = LENGTH(input);
386  int origlen = len;
387 
388  chars_t buffer = ALLOC( CHAR_SIZE*len ); // buffer for the longest word
389  //chars_t bp = buffer;
390  char_t ch;
391 
392  int_t errpos = -1;
393  //char_t errchar = NULL_CHAR;
394 
395  int last_token = TOKEN_LINEEND;
396  //int crlf = 0;
397 
398  // Gets the next token. Return:
399  // TOKEN_END if there are no more tokens
400  // TOKEN_WORD if the token is a word
401  // TOKEN_DIRTY_WORD if the token is a word with \ or |
402  // TOKEN_OPEN if the token is [
403  // TOKEN_CLOSE if the token is ]
404  // TOKEN_LINEEND if the token is <nl>
405  // TOKEN_SPACE if at least one whitespace is met
406 
407 
408  //int co=0;
409  int get_token( atom_t *token, int method )
410  {
411  //co++;
412  //if (0 == co%1024)
413  //{
414  //printf("%d ",co);
415  //}
416  // return 1 if the text collected in buffer so far (up to bp) looks like
     // the mantissa-plus-"E" prefix of a number, 0 otherwise. The caller uses
     // this to decide whether a following '+' or '-' is an exponent sign.
417  int is_number(chars_t bp)
418  { // "E" {digit}* "." {digit}+
419  // 1 2 3 4
     // The pattern above is read BACKWARDS from bp towards buffer; the
     // numbers are the values of num_mode while each part is being matched.
     // NOTE(review): mode 2 below accepts exactly one digit before it demands
     // a '.', so the code does not implement the {digit}* shown in the
     // pattern (e.g. "1.25E" is rejected, "1.5E" is accepted) - confirm
     // whether this is intended.
420  chars_t cp = bp;
421  int num_mode = 1;
422  char_t ch;
423 
424  cp = bp;
425 
426  if( bp==buffer ) return 0; // nothing collected yet
427 
428  while( cp>buffer )
429  {
430  cp--;
431  ch = *cp;
432  //printf("num_mode=%d ch=%C\n",num_mode,ch);
433  switch( num_mode )
434  {
435  case 1: // last collected character must be E or e
436  if( ch!=TEXT('E') && ch!=TEXT('e') ) return 0;
437  num_mode = 2;
438  break;
439  case 2: ; // one decimal digit
440  if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
441  num_mode = 3;
442  break;
443  case 3: ; // decimal point
444  if( ch!=TEXT('.') ) return 0;
445  num_mode = 4;
446  break;
447  case 4: ; // any further characters must all be decimal digits
448  if( ch<TEXT('0') || ch>TEXT('9') ) return 0;
449  break;
450  }
451  //printf("num_mode=%d\n",num_mode);
452  }
     // reached the start of buffer without a mismatch (in whatever mode)
453  return 1;
454  }
455 
456  if( !len ) return TOKEN_END;
457 
458  int dirty = 0;
459  int mutated = 0;
460  chars_t bp = buffer; *bp=NULL_CHAR;
461  chars_t sp = source;
462 
463  int mode = MODE_ENTRY; // current mode
464  int code; // action code
465 
466 
467  int stack[MAX_MODE]; // return-to-mode for each mode
468  static int mode_eof[MAX_MODE] =
469  {
470  /* entry */ PAT_TOKEN_END,
471  /* whitespace */ PAT_TOKEN_SPACE,
472  /* word */ PAT_TOKEN_WORD,
473  /* barred */ PAT_ERROR,
474  /* backslashed */ PAT_ERROR,
475  /* tilde */ PAT_ERROR,
476  /* semitilde */ PAT_ERROR,
477  /* semicolon */ PAT_RETURN,
478  /* tildespace */ PAT_ERROR,
479  /* less */ PAT_TOKEN_WORD,
480  /* greater */ PAT_TOKEN_WORD
481  };
482  static int mode_eol[MAX_MODE] =
483  {
484  /* entry */ PAT_NEXT+PAT_TOKEN_LINE,
485  /* whitespace */ PAT_TOKEN_SPACE,
486  /* word */ PAT_TOKEN_WORD,
487  /* barred */ PAT_PUSH+PAT_NEXT,
488  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
489  /* tilde */ PAT_NEXT+PAT_RETURN,
490  /* semitilde */ PAT_NEXT+PAT_RETURN,
491  /* semicolon */ PAT_RETURN,
492  /* tildespace */ PAT_NEXT+PAT_RETURN,
493  /* less */ PAT_TOKEN_WORD,
494  /* greater */ PAT_TOKEN_WORD
495  };
496  static int mode_space[MAX_MODE] =
497  {
498  /* entry */ PAT_NEXT+PAT_GOTO_SPACE,
499  /* whitespace */ PAT_NEXT,
500  /* word */ PAT_TOKEN_WORD,
501  /* barred */ PAT_PUSH+PAT_NEXT,
502  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
503  /* tilde */ PAT_NEXT+PAT_GOTO_TILDESPACE,
504  /* semitilde */ PAT_NEXT,
505  /* semicolon */ PAT_NEXT,
506  /* tildespace */ PAT_NEXT,
507  /* less */ PAT_TOKEN_WORD,
508  /* greater */ PAT_TOKEN_WORD
509  };
510  static int mode_open[MAX_MODE] =
511  {
512  /* entry */ PAT_NEXT+PAT_TOKEN_OPEN,
513  /* whitespace */ PAT_TOKEN_SPACE,
514  /* word */ PAT_TOKEN_WORD,
515  /* barred */ PAT_PUSH+PAT_NEXT,
516  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
517  /* tilde */ PAT_TILDE+PAT_RETURN,
518  /* semitilde */ PAT_RETURN,
519  /* semicolon */ PAT_NEXT,
520  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
521  /* less */ PAT_TOKEN_WORD,
522  /* greater */ PAT_TOKEN_WORD
523  };
524  static int mode_close[MAX_MODE] =
525  {
526  /* entry */ PAT_NEXT+PAT_TOKEN_CLOSE,
527  /* whitespace */ PAT_TOKEN_SPACE,
528  /* word */ PAT_TOKEN_WORD,
529  /* barred */ PAT_PUSH+PAT_NEXT,
530  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
531  /* tilde */ PAT_TILDE+PAT_RETURN,
532  /* semitilde */ PAT_RETURN,
533  /* semicolon */ PAT_NEXT,
534  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
535  /* less */ PAT_TOKEN_WORD,
536  /* greater */ PAT_TOKEN_WORD
537  };
538  static int mode_bar[MAX_MODE] =
539  {
541  /* whitespace */ PAT_TOKEN_SPACE,
543  /* barred */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
544  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
545  /* tilde */ PAT_TILDE+PAT_RETURN,
546  /* semitilde */ PAT_RETURN,
548  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
549  /* less */ PAT_TOKEN_WORD,
550  /* greater */ PAT_TOKEN_WORD
551  };
552  static int mode_backslash[MAX_MODE] =
553  {
555  /* whitespace */ PAT_TOKEN_SPACE,
558  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
559  /* tilde */ PAT_TILDE+PAT_RETURN,
560  /* semitilde */ PAT_RETURN,
561  /* semicolon */ PAT_NEXT2, //+PAT_RETURN_TO_SELF+PAT_GOTO_BACKSLASHED,
562  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
563  /* less */ PAT_TOKEN_WORD,
564  /* greater */ PAT_TOKEN_WORD
565  };
566  static int mode_tilde[MAX_MODE] =
567  {
569  /* whitespace */ PAT_TOKEN_SPACE,
571  /* barred */ PAT_PUSH+PAT_NEXT,
572  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
573  /* tilde */ PAT_TILDE+PAT_RETURN,
574  /* semitilde */ PAT_RETURN,
576  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
577  /* less */ PAT_TOKEN_WORD,
578  /* greater */ PAT_TOKEN_WORD
579  };
580  static int mode_semicolon[MAX_MODE] =
581  {
583  /* whitespace */ PAT_TOKEN_SPACE,
585  /* barred */ PAT_PUSH+PAT_NEXT,
586  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
587  /* tilde */ PAT_TILDE+PAT_RETURN,
588  /* semitilde */ PAT_RETURN,
589  /* semicolon */ PAT_NEXT+PAT_RETURN_TO_SELF+PAT_GOTO_SEMITILDE,
590  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
591  /* less */ PAT_TOKEN_WORD,
592  /* greater */ PAT_TOKEN_WORD
593  };
594  static int mode_else[MAX_MODE] =
595  {
596  /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_WORD,
597  /* whitespace */ PAT_TOKEN_SPACE,
598  /* word */ PAT_PUSH+PAT_NEXT,
599  /* barred */ PAT_PUSH+PAT_NEXT,
600  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
601  /* tilde */ PAT_TILDE+PAT_RETURN,
602  /* semitilde */ PAT_RETURN,
603  /* semicolon */ PAT_NEXT,
604  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
605  /* less */ PAT_TOKEN_WORD,
606  /* greater */ PAT_TOKEN_WORD
607  };
608  static int mode_parens[MAX_MODE] =
609  {
610  /* entry */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
611  /* whitespace */ PAT_TOKEN_SPACE,
612  /* word */ PAT_TOKEN_WORD,
613  /* barred */ PAT_PUSH+PAT_NEXT,
614  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
615  /* tilde */ PAT_TILDE+PAT_RETURN,
616  /* semitilde */ PAT_RETURN,
617  /* semicolon */ PAT_NEXT,
618  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
619  /* less */ PAT_TOKEN_WORD,
620  /* greater */ PAT_TOKEN_WORD
621  };
622  static int mode_equal[MAX_MODE] =
623  {
624  /* entry */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
625  /* whitespace */ PAT_TOKEN_SPACE,
626  /* word */ PAT_TOKEN_WORD,
627  /* barred */ PAT_PUSH+PAT_NEXT,
628  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
629  /* tilde */ PAT_TILDE+PAT_RETURN,
630  /* semitilde */ PAT_RETURN,
631  /* semicolon */ PAT_NEXT,
632  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
633  /* less */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
634  /* greater */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD
635  };
636  static int mode_less[MAX_MODE] =
637  {
638  /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS,
639  /* whitespace */ PAT_PUSH+PAT_NEXT+PAT_GOTO_LESS, //PAT_TOKEN_WORD, @boza
640  /* word */ PAT_TOKEN_WORD,
641  /* barred */ PAT_PUSH+PAT_NEXT,
642  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
643  /* tilde */ PAT_TILDE+PAT_RETURN,
644  /* semitilde */ PAT_RETURN,
645  /* semicolon */ PAT_NEXT,
646  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
647  /* less */ PAT_TOKEN_WORD,
648  /* greater */ PAT_TOKEN_WORD
649  };
650  static int mode_greater[MAX_MODE] =
651  {
652  /* entry */ PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER,
653  /* whitespace */ PAT_PUSH+PAT_NEXT+PAT_GOTO_GREATER, //PAT_TOKEN_WORD, @boza
654  /* word */ PAT_TOKEN_WORD,
655  /* barred */ PAT_PUSH+PAT_NEXT,
656  /* backslashed */ PAT_PUSH+PAT_NEXT+PAT_RETURN,
657  /* tilde */ PAT_TILDE+PAT_RETURN,
658  /* semitilde */ PAT_RETURN,
659  /* semicolon */ PAT_NEXT,
660  /* tildespace */ PAT_TILDE+PAT_TOKEN_WORD,
661  /* less */ PAT_PUSH+PAT_NEXT+PAT_TOKEN_WORD,
662  /* greater */ PAT_TOKEN_WORD
663  };
664 
665 
666 again:
667  // get action code
668  GET_CHAR;
669 
670 
671 #ifdef DEBUG_TOKENS
672  printf("<TOKENS> length=%d\n",len);
673  if(ch<TEXT(' '))
674  printf("<TOKENS> get(#%d)\t",ch);
675  else
676  printf("<TOKENS> get('%c')\t",ch);
677  switch( mode )
678  {
679  case MODE_ENTRY: printf(" ENTRY -> "); break;
680  case MODE_SPACE: printf(" SPACE -> "); break;
681  case MODE_WORD: printf(" WORD -> "); break;
682  case MODE_BARRED: printf(" BARRED -> "); break;
683  case MODE_BACKSLASHED: printf(" BACKSLASH -> "); break;
684  case MODE_TILDE: printf(" TILDE -> "); break;
685  case MODE_SEMITILDE: printf(" SEMITILDE -> "); break;
686  case MODE_SEMICOLON: printf(" SEMICOLON -> "); break;
687  case MODE_TILDESPACE: printf("TILDESPACE -> "); break;
688  }
689 #endif
690  code = mode_else[mode];
691  if( !len )
692  code = mode_eof[mode];
693  else
694  {
695  if( method==TOKENIZE_READWORD )
696  { // tokenize as expected by READWORD
697  if( ch==TEXT('|') ) code = mode_bar[mode];
698  else if( ch==TEXT('\\') ) code = mode_backslash[mode];
699  }
700  else
701  if( method==TOKENIZE_READLIST )
702  { // tokenize as data
703  if( ch==TEXT('\n') ) code = mode_eol[mode];
704  else if( ch==TEXT('\r') ) code = mode_eol[mode];
705  else if( ch<=TEXT(' ') ) code = mode_space[mode];
706  else if( ch==TEXT('[') ) code = mode_open[mode];
707  else if( ch==TEXT(']') ) code = mode_close[mode];
708  else if( ch==TEXT('|') ) code = mode_bar[mode];
709  else if( ch==TEXT('\\') ) code = mode_backslash[mode];
710  else if( ch==TEXT('~') ) code = mode_tilde[mode];
711  }
712  else
713  if( method==TOKENIZE_DATA )
714  { // tokenize as data
715  if( ch==TEXT('\n') ) code = mode_eol[mode];
716  else if( ch==TEXT('\r') ) code = mode_eol[mode];
717  else if( ch<=TEXT(' ') ) code = mode_space[mode];
718  else if( ch==TEXT('[') ) code = mode_open[mode];
719  else if( ch==TEXT(']') ) code = mode_close[mode];
720  else if( ch==TEXT('|') ) code = mode_bar[mode];
721  else if( ch==TEXT('\\') ) code = mode_backslash[mode];
722  else if( ch==TEXT('~') ) code = mode_tilde[mode];
723  else if( ch==TEXT(';') ) code = mode_semicolon[mode];
724  }
725  else
726  { // tokenize as commands
727  //printf("ch=%c *buf=%c\n",ch,*buffer);
728  if( ch==TEXT('(') ||
729  ch==TEXT(')') ) code = mode_parens[mode];
730  else if( *buffer!=TEXT('"') )
731  {
732  if( (ch==TEXT('+') || ch==TEXT('-')) && is_number(bp) ) { }
733  else if( ch==TEXT('+') ||
734  ch==TEXT('-') ||
735  ch==TEXT('*') ||
736  ch==TEXT('/')) code = mode_parens[mode];
737  else if( ch==TEXT('=') ) code = mode_equal[mode];
738  else if( ch==TEXT('<') ) code = mode_less[mode];
739  else if( ch==TEXT('>') ) code = mode_greater[mode];
740  }
741  }
742  }
743  // process action code
744  int _stack = -1;
745 
746  int newmode = 0;
747  if( code & PAT_GOTO )
748  {
749  // this is pred-processing of GOTO
750  // if old mode was space, and new is not,
751  // then update initial position of next token
752  newmode = (code>>PAT_SHIFT) & 0xF;
753  if( mode==MODE_SPACE && newmode!=MODE_SPACE ) { sp = source; }
754  }
755 
756 
757  if( code & PAT_TILDE )
758  {
759  *bp++ = TEXT('~');
760  #ifdef DEBUG_TOKENS
761  printf("\n<TOKENS> put('%c'/%d)\n",TEXT('~'),TEXT('~'));
762  #endif //DEBUG_TOKENS
763  }
764  if( code & PAT_PUSH )
765  {
766  // push a character only if:
767  // - currently not in bars |..?..|
768  // - currently in bars, but not in semicolon ;..|..?..|
769  if( mode!=MODE_BARRED ||
770  (stack[mode]!=MODE_SEMITILDE &&
771  stack[mode]!=MODE_SEMICOLON) )
772  {
773  //if( mode==MODE_BARRED || mode==MODE_BACKSLASHED )
774  //*bp++ = ENBAR(ch);
775  //else
776  *bp++ = ch;
777  #ifdef DEBUG_TOKENS
778  printf("\n<TOKENS> put('%c'/%d)\n",ch,ch);
779  #endif //DEBUG_TOKENS
780  //if( ch=='\r' && *(source+1)=='\n' ) // handle CRLF cases
781  //{
782  //*bp++ = '\n';
783  //#ifdef DEBUG_TOKENS
784  // printf("\n<TOKENS> put('%d')\n",'\n');
785  // #endif //DEBUG_TOKENS
786  //}
787  }
788  }
789  if( code & PAT_NEXT2 )
790  {
791  source++;
792  len--;
793  }
794  if( code & (PAT_NEXT|PAT_NEXT2) )
795  {
796  if( *source=='\r' )
797  {
798  //crlf = 0;
799  if( *(source+1)=='\n' )
800  {
801  //crlf = 1;
802  source++;
803  len--;
804  }
805  }
806  source++;
807  len--;
808  }
809  if( code & PAT_DIRTY ) dirty = 1;
810  if( code & PAT_MUTATED ) mutated = 1;
811  if( code & PAT_ERROR_POS ) errpos = origlen-len-1;
812  if( code & PAT_RETURN_TO_WORD ) _stack = MODE_WORD;
813  if( code & PAT_RETURN_TO_SELF ) _stack = mode;
814  if( code & PAT_RETURN_TO_CALLER ) _stack = stack[mode];
815  if( code & PAT_GOTO )
816  {
817  // this is post-processing of GOTO
818  mode = newmode;
819  stack[mode] = _stack;
820  }
821  if( code & PAT_RETURN ) mode = stack[mode];
822  if( code & PAT_TOKEN )
823  {
824  int _token = (code>>PAT_SHIFT) & 0xF;
825  if( _token!=TOKEN_WORD ) return _token;
826  if( mutated )
827  {
828  *bp = NULL_CHAR;
829  *token = new_word( buffer, UNKNOWN );
830  #ifdef DEBUG_TOKENS
831  printf("MUTATED TOKEN "); dumpln(*token);
832  printf("\n\n");
833  #endif
834  }
835  else
836  {
837  *token = new_subword( input, sp, source-sp /*bp-buffer*/ );
838  #ifdef DEBUG_TOKENS
839  printf("NORMAL TOKEN **"); dump(*token);
840  printf("** (len=%d)\n\n\n",source-sp);
841  #endif
842  }
843  return dirty?TOKEN_DIRTY_WORD:TOKEN_WORD;
844  }
845  if( code & PAT_ERROR )
846  {
847  #ifdef DEBUG_TOKENS
848  printf("ERROR\n");
849  #endif
850  return TOKEN_ERROR;
851  }
852 
853 #ifdef DEBUG_TOKENS
854  switch( mode )
855  {
856  case MODE_ENTRY: printf("ENTRY\n"); break;
857  case MODE_SPACE: printf("SPACE\n"); break;
858  case MODE_WORD: printf("WORD\n"); break;
859  case MODE_BARRED: printf("BARRED\n"); break;
860  case MODE_BACKSLASHED: printf("BACKSLASH\n"); break;
861  case MODE_TILDE: printf("TILDE\n"); break;
862  case MODE_SEMITILDE: printf("SEMITILDE\n"); break;
863  case MODE_SEMICOLON: printf("SEMICOLON\n"); break;
864  case MODE_TILDESPACE: printf("TILDESPACE\n"); break;
865  }
866 #endif
867  goto again;
868  } // get_token()
869 
870 
871  atom_t get_sublist( int level, int full_parse, atom_t* pos ) //ex2//
872  {
     // Collects tokens from get_token() into a list until a closing bracket,
     // the end of input or an error is met; an opening bracket starts a
     // recursive call that builds the nested list.
     //   level      - nesting depth; 0 for the outermost call
     //   full_parse - tokenization method handed to get_token()
     //   pos        - if not NULL, receives NULL or (for TOKENIZE_DATA) a
     //                subword of input spanning the text of this sublist
     // Returns the collected list. Errors are reported through the enclosing
     // function's last_token (set to TOKEN_ERROR) and errpos.
873  atom_t result = empty_list;
874  atom_t last = empty_list;
875  if( pos ) *pos = NULL; //ex2//
876 
877  atom_t token = NULL;
878  atom_t sublist_pos = NULL; //ex2//
879  int flags;
880  int bracketlen = len; // remaining length when this sublist was opened (used to locate an unmatched opening bracket)
881 
882  int pos_from = source-STRING(input); //ex2//
883  if( pos_from ) pos_from--; //ex2//
884 
885  flags = 0;//FLAG_NEWLINE;
     // the loop ends when get_token() returns 0
     // (presumably TOKEN_END - confirm against its definition)
886  while( (last_token=get_token(&token,full_parse)) )
887  {
888  sublist_pos = NULL;
889 
890  #ifdef DEBUG_TOKENIZATION
891  switch(last_token)
892  {
893  case TOKEN_END: printf("TOKEN_END\n"); break;
894  case TOKEN_SPACE: printf("TOKEN_SPACE\n"); break;
895  case TOKEN_WORD: printf("TOKEN_WORD @"); dump(token); printf("@\n"); break;
896  case TOKEN_DIRTY_WORD: printf("TOKEN_|WORD| @"); dump(token); printf("@\n"); break;
897  case TOKEN_OPEN: printf("TOKEN_OPEN [\n"); break;
898  case TOKEN_CLOSE: printf("TOKEN_CLOSE ]\n"); break;
899  case TOKEN_LINEEND: printf("TOKEN_LINEEND\n"); break;
900  case TOKEN_ERROR: printf("TOKEN_ERROR\n"); break;
901  }
902  #endif
903 
     // on error hand back what was collected so far; last_token stays
     // TOKEN_ERROR so the callers can detect the failure
904  if( last_token==TOKEN_ERROR ) return result;
905  if( last_token==TOKEN_SPACE )
906  {
     // whitespace is not stored as an element; it is recorded as flags on
     // the previous element and on the next one
907  if( IS_NOT_EMPTY(last) ) SET_FLAGS( last, FLAG_BEFORE_SPACE );
908  flags |= FLAG_AFTER_SPACE;
909  continue;
910  }
911  if( last_token==TOKEN_DIRTY_WORD )
912  {
     // the word contains \ or | - replace it with its purified version
913  //printf("###BEFORE="); dumpln(token);
914  atom_t x = purify( token );
915  DEUSE( token );
916  token = x;
917  //printf("###AFTER="); dumpln(token);
918  }
919  if( last_token==TOKEN_CLOSE ) break;
920  if( last_token==TOKEN_LINEEND )
921  {
     // a line end is remembered as a flag for the next element
922  flags |= FLAG_NEWLINE;
923  continue;
924  }
925  if( last_token==TOKEN_OPEN)
926  {
     // nested lists are always tokenized as data, whatever the outer method
927  token = get_sublist( level+1, TOKENIZE_DATA, &sublist_pos ); // recursive //ex2//
928 
     // the recursive call reports failure through last_token
929  if( last_token==TOKEN_ERROR )
930  {
931  DEUSE( token );
932  if( sublist_pos ) DEUSE(sublist_pos);
933  sublist_pos = NULL;
934  break;
935  }
936  }
937 
     // note: these two tests use the method of the enclosing tokenize() call,
     // not the full_parse argument of this function
938  if( method==TOKENIZE_COMMANDS )
939  flags |= FLAG_TOKENIZED_COMMANDS;
940 
941  if( method==TOKENIZE_DATA )
942  flags |= FLAG_TOKENIZED_DATA;
943 
     // a clean word that starts with '?' followed by a digit is a template slot
944  if( method==TOKENIZE_COMMANDS &&
945  last_token==TOKEN_WORD &&
946  LENGTH(token)>1 &&
947  *STRING(token)==TEXT('?') &&
948  *(STRING(token)+1)>=TEXT('0') &&
949  *(STRING(token)+1)<=TEXT('9') )
950  {
951  //printf(">>>%d %d\n", last_token==TOKEN_WORD, last_token==TOKEN_DIRTY_WORD);
952  // process template ?nn->(? nn) for command tokenization
953  //printf("append token **"); dump(token); printf("**\n");
954  atom_t new_qoken = new_subword( token, STRING(token), 1 );
955  atom_t new_token = new_subword( token, STRING(token)+1, LENGTH(token)-1 );
956  DEUSE( token );
957 
     // note: the four elements appended here do not receive the pending flags
958  append( new_word(TEXT("("),-1), &result, &last ); // (
959  append( new_qoken, &result, &last ); // ?
960  append( new_token, &result, &last ); // nn
961  append( new_word(TEXT(")"),-1), &result, &last ); // )
962  }
963  else
964  {
965  // normal token, no more processing needed
966  if( sublist_pos ) //ex//
967  {
     // a nested list that came with a source position is stored in an
     // extended node, with the position kept in POS(last)
968  append_ex( token, &result, &last );
969  POS( last ) = sublist_pos;
970 
971  //printf("\n\nSET EXTENDED POSITION ");
972  //dump_atom(sublist_pos,1); printf("\n");
973  //printf("CURRENT RESULT ");
974  //dump_atom(result,1); printf("\n\n");
975 
976  sublist_pos = NULL;
977  }
978  else
979  {
980  append( token, &result, &last );
981  }
982  SET_FLAGS( last, flags );
983  }
984 
985  flags = 0;
986  if( last_token==TOKEN_ERROR ) break;
987  }
988 
989  // test for unmatching square brackets
990  // i.e. ...[... or ...]...
991  if( level )
992  {
     // inside a sublist: running out of input means its '[' was never closed
993  if( last_token==TOKEN_END )
994  {
995  errpos = origlen-bracketlen-1;
996  last_token = TOKEN_ERROR;
997  }
998  }
999  else
1000  {
     // at top level: a ']' has no matching '['
1001  if( last_token==TOKEN_CLOSE )
1002  {
1003  errpos = origlen-len-1;
1004  last_token = TOKEN_ERROR;
1005  }
1006  }
1007 
1008  int pos_to = source-STRING(input);
1009  if( pos_to ) pos_to--;
1010 
     // report the source span of this sublist, for data tokenization only
1011  if( pos && method == TOKENIZE_DATA ) //ex2//
1012  {
1013  *pos = new_subword( input, STRING(input)+pos_from, pos_to-pos_from+1 );
1014  }
1015 
1016  return result;
1017  } // get_sublist()
1018 
1019  atom_t result = get_sublist( 0, method, NULL ); //ex//
1020 
1021  // in case of error return empty list
1022  if( last_token==TOKEN_ERROR )
1023  {
1024  //printf("ERROR RESULT=");dumpln(result);
1025  DEUSE( result );
1026  result = new_parse_error( ERROR_INCOMPLETE_PAIR, errpos, input );
1027  }
1028 
1029 
1030  DEALLOC( buffer );
1031  //printf("#########");
1032  //dump_atom(result,1);
1033  //printf("######\n");
1034 
1035  return result;
1036 }
atom_t trim_shell_comment ( atom_t  word)
Parameters
word: word containing source text
Returns
atom with the source text with trimmed shell comment

Trims a shell comment from the beginning of the word. A shell comment can only be the first line, and only if its first two characters are #!. If a shell comment is trimmed, the result is a subword holding the text that follows the first line; otherwise the input word is returned as is, but with an increased reference count.

Definition at line 1055 of file parser.c.

1056 {
      // Returns word itself (through USE) when it is shorter than two
      // characters or does not start with "#!"; otherwise returns a subword
      // that begins where the first line ends.
1057  #ifdef SAFEMODE
1058  assert( IS_WORD(word) || IS_SUBWORD(word) );
1059  #endif
1060 
1061  chars_t source = STRING(word);
1062  int_t len = LENGTH(word);
1063 
1064  // if there are not enough characters just exit
1065  if( LENGTH(word)<2 ) return USE(word);
1066 
1067  // if the first two characters are not #! then exit
1068  if( *source!=TEXT('#') || *(source+1)!=TEXT('!') ) return USE(word);
1069 
1070  // skip the line; the scan stops AT the '\n' (which is not consumed)
      // or when the text runs out, in which case len is 0
1071  while( len && *source!=TEXT('\n') )
1072  {
1073  source++;
1074  len--;
1075  }
1076 
1077  // return a subword. Pay attention to always reference
1078  // the main host word because the input could be a word
1079  // or a subword.
1080  if( IS_WORD(word) )
1081  return new_subword( word, source, len );
1082  else
1083  return new_subword( WORD(word), source, len );
1084 }
atom_t purify ( atom_t  word)
Parameters
word: word to be purified
Returns
purified word

Purifies a word by processing all backslashes and bars. Returns a new word if needed. Assumes that the input needs purification.

Definition at line 1099 of file parser.c.

1100 {
      // Copies the characters of word into a temporary buffer while dropping
      // the backslash and bar markers themselves. Characters that follow a
      // backslash or sit between bars are stored through ENBAR(). Returns a
      // new word built from the buffer if at least one marker was dropped,
      // otherwise the input word (through USE).
1101  //return USE(word);
1102  #ifdef SAFEMODE
1103  assert( IS_WORD(word) || IS_SUBWORD(word) );
1104  #endif
1105 
1106  chars_t source = STRING(word);
1107  int_t len = LENGTH(word);
1108 
      // NOTE(review): the buffer has room for len characters (assuming
      // CHAR_SIZE is the size of one char_t - confirm). If no marker is
      // dropped, all len characters are copied and the terminator written
      // after the loop lands one element past the buffer; this relies on the
      // documented assumption that the input really needs purification.
1109  chars_t buffer = alloca( CHAR_SIZE*len ); // buffer for the longest word
1110  chars_t bp = buffer;
1111 
1112  int need_enbar = 0;
1113  int is_mutated = 0; // set to 1 if the word is mutated
1114  int in_backslash = 0;
1115  int in_bars = 0;
1116  for( ; len; len--,source++ )
1117  {
      // decided from the state BEFORE the current character is examined
1118  need_enbar = in_bars || in_backslash;
1119  if( in_backslash )
1120  {
      // the character after a backslash is always copied,
      // even if it is itself a backslash or a bar
1121  in_backslash = 0;
1122  }
1123  else if( *source==TEXT('\\') )
1124  {
      // drop the backslash and remember it for the next character
1125  is_mutated = 1;
1126  in_backslash = 1;
1127  continue;
1128  }
1129  else if( *source==TEXT('|') )
1130  {
      // drop the bar and toggle the in-bars state
1131  is_mutated = 1;
1132  in_bars = !in_bars;
1133  continue;
1134  }
1135  if( need_enbar )
1136  *bp++ = ENBAR(*source);
1137  else
1138  *bp++ = *source;
1139 
1140  //if( need_enbar )
1141  //printf(" PURIFY %d %d\n",*source,ENBAR(*source));
1142  //else
1143  //printf(" PURIFY %d \n",*source );
1144  }
1145 
1146  *bp = NULL_CHAR;
1147 
1148  if( is_mutated )
1149  return new_word( buffer, bp-buffer ); // second argument is the number of characters copied
1150  else
1151  return USE(word);
1152 }
atom_t build_syntax_tree ( atom_t  func)
Parameters
func: var atom for the parse context
Returns
empty_list or an error atom

Completely parses a function. Its source is stored in its body as a word, a subword, a data-tokenized list or a command-tokenized list. Building algorithm:

  • tokenization of body as commands
  • extracting all TO ... END definitions and creating them as subfunctions
  • parsing the func's body into abstract syntax tree
  • recursively build trees of subfunctions

Definition at line 2278 of file parser.c.

2279 {
      // Builds TREE(func) and, recursively, the trees of the functions found
      // in LOCALS(func). Returns empty_list on success, or the error atom
      // produced by tokenize(), preparse(), parse() or a recursive call.
2280  // exit if the function already has a syntax tree
2281  if( IS_NOT_EMPTY(TREE(func)) ) return empty_list;
2282 
2283  //printf("BUILD_SYNTAX_TREE(FUNC=");
2284  //dump(NAME(func));
2285  //printf(",SOURCE=");
2286  //dump(SOURCE(func));
2287  //printf(",BODY=");
2288  //dump(BODY(func));
2289  //printf(",LOCALS=");
2290  //dump(LOCALS(func));
2291  //printf(")\n\n");
2292 
      // the body is built from SOURCE(func) only if it is still empty
2293  if( IS_EMPTY(BODY(func)) )
2294  {
2295  // Step 1. Tokenize (first as data, then the result as commands)
2296  //printf("SOURCE="); dumpln(SOURCE(func));
2297  atom_t tokens1 = tokenize( SOURCE(func), TOKENIZE_DATA );
2298  if( IS_ERROR(tokens1) ) return tokens1;
2299  //printf("TOKENS1="); dumpln(tokens1);
2300 
2301  atom_t tokens2 = tokenize( tokens1, TOKENIZE_COMMANDS );
2302  DEUSE( tokens1 );
2303  if( IS_ERROR(tokens2) ) return tokens2;
2304  //printf("TOKENS2="); dumpln(tokens2);
2305 
2306  // Step 2. Extract TO..END's
      // NOTE(review): tokens2 is not released in this function; presumably
      // preparse() takes ownership of it - confirm.
2307  atom_t body = preparse( tokens2, func, LEVEL(func) );
2308  if( IS_ERROR(body) ) return body;
2309  DEUSE( BODY(func) );
2310  BODY(func) = body;
2311  //printf("BODY="); dumpln(BODY(func));
2312  }
2313 
2314 
2315  // Step 3. Parse function body
2316  //printf("BODY="); dumpln(BODY(func));
2317  atom_t tree = parse( BODY(func), func, 1 );
2318  if( IS_ERROR(tree) ) return tree;
2319  DEUSE(TREE(func));
2320  TREE(func) = tree;
2321  //printf("TREE="); dumpln(TREE(func));
2322 
2323  // because there might be some new TO..ENDs
2324  // scan all locals and build those which
2325  // have no trees
2326  atom_t local;
2327  atom_t locals;
2328  for( locals=LOCALS(func); IS_NOT_EMPTY(locals); locals=CDR(locals) )
2329  {
2330  local = CAR(locals);
2331  if( !DESCR2(local) ) continue; // skip locals for which DESCR2 is not set
2332  atom_t x = build_syntax_tree( local );
2333  if( IS_ERROR(x) ) return x;
2334  }
2335 
2336  return empty_list;
2337 }

Variable Documentation

char_t enbar[128]
char_t debar[32]

Definition at line 211 of file parser.c.


[ HOME | INDEX | ATOMS | VARS | REFERENCE ]
Lhogho Developer's Documentation
Wed Jul 10 2013