|  | 
|  | 1 | +use logos::Logos; | 
|  | 2 | +use std::fs; | 
|  | 3 | + | 
|  | 4 | +#[derive(Logos, Debug, PartialEq)] | 
|  | 5 | +#[logos(skip r"[ \t\f]+")] // Ignore this regex pattern between tokens | 
|  | 6 | +enum Token { | 
|  | 7 | + #[regex("[a-zA-Z0-9_]+[^;]*;"gm)] | 
|  | 8 | + Expr, | 
|  | 9 | + #[regex("\n+"gm)] | 
|  | 10 | + Newline, | 
|  | 11 | + #[regex("/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/|--[^\n]*"g)] | 
|  | 12 | + Comment, | 
|  | 13 | +} | 
|  | 14 | + | 
|  | 15 | +fn main() { | 
|  | 16 | + let source = fs::read_to_string("./src/example.sql").unwrap(); | 
|  | 17 | + let mut lex = Token::lexer(&source); | 
|  | 18 | + | 
|  | 19 | + println!("{:?}", source); | 
|  | 20 | + | 
|  | 21 | + // https://github.com/domenicquirl/cstree | 
|  | 22 | + // https://ericlippert.com/2012/06/08/red-green-trees/ | 
|  | 23 | + // | 
|  | 24 | + // So, for example, to parse a struct definition the parser first "enters" the struct definition node, then parses the struct keyword and type name, then parses each field, and finally "finishes" parsing the struct node. | 
|  | 25 | + // | 
|  | 26 | + // 1. lexer: parse string into tokens. cstree will allow us to just move forward until next | 
|  | 27 | + // statement. also, for comments, we should be able to store them separately since we are | 
|  | 28 | + // just walking over the source code. tokens should be expr, whitespace, newlines, comments | 
|  | 29 | + // and eof. does not work because lexer is "dumb". Token != SyntaxKind, so maybe we do not | 
|  | 30 | + // need a real lexer. | 
|  | 31 | + // 2. parser: parse tokens into cst with cstree. nodes are not typed, and we should be able to | 
|  | 32 | + // use pg_query to parse string, and turn that into SyntaxKind tokens. | 
|  | 33 | + // | 
|  | 34 | + // | 
|  | 35 | + // Notes: | 
|  | 36 | + // - maybe we do not real a lexer to parse into statements. we can just use simple string | 
|  | 37 | + // operations? or maybe lexer but with metadata on tokens because normally a token | 
|  | 38 | + // translates into a constant which is not what we want. instead, we want a token Expr to | 
|  | 39 | + // hold the expression string. | 
|  | 40 | + | 
|  | 41 | + // problem: comments | 
|  | 42 | + // general problem: declarative parsing by token will, based on initial research, not work well because we have tokens | 
|  | 43 | + // within tokens (comment can be within a sql query) | 
|  | 44 | + // let parser = any::<_, extra::Err<Simple<char>>>() | 
|  | 45 | + // .and_is(just(';').not()) | 
|  | 46 | + // .repeated() | 
|  | 47 | + // .collect::<String>() | 
|  | 48 | + // .padded() | 
|  | 49 | + // .separated_by(just(';')) | 
|  | 50 | + // .collect::<Vec<String>>(); | 
|  | 51 | + // | 
|  | 52 | + // let comment = just("--") | 
|  | 53 | + // .then( | 
|  | 54 | + // any::<_, extra::Err<Simple<char>>>() | 
|  | 55 | + // .and_is(just('\n').not()) | 
|  | 56 | + // .repeated(), | 
|  | 57 | + // ) | 
|  | 58 | + // .padded(); | 
|  | 59 | + // | 
|  | 60 | + // let comments = comment.parse(source.as_str()); | 
|  | 61 | + // let result = parser.parse(source.as_str()); | 
|  | 62 | + // | 
|  | 63 | + // println!("{:?}", source); | 
|  | 64 | + // println!("{:?}", result); | 
|  | 65 | + // println!("{:?}", comments); | 
|  | 66 | + // | 
|  | 67 | + // let pg_query_result = pg_query::parse("SELECT * FROM contacts").unwrap(); | 
|  | 68 | + // | 
|  | 69 | + // println!("{:?}", pg_query_result.protobuf.nodes()); | 
|  | 70 | +} | 
|  | 71 | + | 
#[test]
fn test_lexer() {
    let input = "select * from contact where id = '123';\n\n-- test comment\n\nselect wrong statement;\n\nselect id,username from contact\n\nselect id,name\nfrom contact -- test inline comment\nwhere id = '123';\n\n";

    let mut lexer = Token::lexer(input);

    // Pull the next token, check its kind, and optionally check the matched slice.
    let mut expect = |token: Token, slice: Option<&str>| {
        assert_eq!(lexer.next(), Some(Ok(token)));
        if let Some(expected) = slice {
            assert_eq!(lexer.slice(), expected);
        }
    };

    // A complete single-line statement ends at its semicolon.
    expect(Token::Expr, Some("select * from contact where id = '123';"));
    expect(Token::Newline, None);

    // A `--` comment is lexed separately, not swallowed into an Expr.
    expect(Token::Comment, Some("-- test comment"));
    expect(Token::Newline, None);

    expect(Token::Expr, Some("select wrong statement;"));
    expect(Token::Newline, None);

    // Without an intervening semicolon, everything up to the next `;` is one
    // Expr — including embedded newlines and the inline comment.
    expect(
        Token::Expr,
        Some("select id,username from contact\n\nselect id,name\nfrom contact -- test inline comment\nwhere id = '123';"),
    );
}
0 commit comments