summaryrefslogtreecommitdiff
path: root/lexer.mll
blob: 88081cdc0f8b63560dcc4b979cc922a1cde36c77 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

(* Analyseur lexical pour Petit Caml *)

{
  open Lexing
  open Parser
   
  exception Lexing_error of string

  let kwd_tbl = 
    [
        "else", ELSE; "false", FALSE; "function", FUNCTION; "if", IF;
        "in", IN; "let", LET; "match", MATCH; "not", NOT; "rec", REC;
        "then", THEN; "true", TRUE; "with", WITH
    ]

  let id_or_kwd = 
    let h = Hashtbl.create 17 in
    List.iter (fun (s,t) -> Hashtbl.add h s t) kwd_tbl;
    fun s -> 
      try List.assoc s kwd_tbl with _ -> IDENT s

  let newline lexbuf =
    let pos = lexbuf.lex_curr_p in
    lexbuf.lex_curr_p <- 
      { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum }

  let string_buffer = ref ""
}

let digit = ['0'-'9']
let alpha = ['a'-'z' 'A'-'Z']
let ident = alpha (alpha | "_" | "'" | digit)*
let integer = ['0'-'9']+
let space = [' ' '\t']

rule token = parse
  | '\n'    { newline lexbuf; token lexbuf }
  | space+  { token lexbuf }
  | integer as s { INTEGER (int_of_string s) }
  | "\""    { string_buffer := "" ;
              let start_pos = lexbuf.lex_start_pos
              and start_p = lexbuf.lex_start_p
              in
                lex_string lexbuf ;
                lexbuf.lex_start_pos <- start_pos;
                lexbuf.lex_start_p <- start_p;
                STRING (!string_buffer) }
  | "()"    { UNIT }
  | "[]"    { EMPTYLIST }
  | ident as id { id_or_kwd id }
  | "->"    { RIGHTARROW }
  | "::"    { TWOCOLONS }
  | "_"     { UNDERSCORE }
  | '+'     { PLUS }
  | '-'     { MINUS }
  | '*'     { TIMES }
  | '/'     { DIV }
  | "&&"    { AND }
  | "||"    { OR }
  | "|"     { CASE }
  | "<="    { LE }
  | ">="    { GE }
  | '<'     { LT }
  | '>'     { GT }
  | "<>"    { NEQ }
  | '='     { EQ }
  | "["     { LBRACE }
  | "]"     { RBRACE }
  | '('     { LPAREN }
  | ')'     { RPAREN }
  | ','     { COMMA }
  | ";"     { SEMICOLON }
  | "(*"    { comment 0 lexbuf }
  | eof     { EOF }
  | _ as c  { raise (Lexing_error ("illegal character: " ^ String.make 1 c)) }

and comment depth = parse
  | '\n'    { newline lexbuf; comment depth lexbuf }
  | "(*"    { comment (depth + 1) lexbuf }
  | "*)"    { match depth with
                | 0 -> token lexbuf
                | _ -> comment (depth - 1) lexbuf }
  | eof     { raise (Lexing_error "unterminated comment") }
  | _       { comment depth lexbuf }

and lex_string = parse
  | "\""    { () }
  | "\\\""  { string_buffer := !string_buffer ^ "\"";
              lex_string lexbuf }
  | "\\n"   { string_buffer := !string_buffer ^ "\n";
              lex_string lexbuf }
  | "\\\\"  { string_buffer := !string_buffer ^ "\\";
              lex_string lexbuf }
  | "\\"    { raise (Lexing_error "illegal \\ in string") }
  | "\n" eof | eof
            { raise (Lexing_error "unterminated string") }
  | "\n"    { raise (Lexing_error "illegal \\n in string") }
  | _ as c  { string_buffer := !string_buffer ^ (String.make 1 c) ;
              lex_string lexbuf }