Recursive descent parser generator

Write a recursive descent parser generator that takes a description of a grammar as input and outputs the source code for a parser in the same language as the generator. (So a generator written in C++ would output C++ source code for the parser.) You can assume that all of the rules have been preprocessed into a form suitable for the construction of a recursive descent parser. Check the following links for more details.

Recursive descent parser generator is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

Use the parser generator and a grammar file to build a parser that takes an arithmetic expression and turns it in to three address code. The resulting parser should take this (or something similar) as input:

(one + two) * three - four * five 

And generate this (or something similar) as output:

_0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003 

This is the code containing the grammar definition and the parser generator.

using System.CodeDom.Compiler; using System.Linq.Expressions; public abstract record AST() {  public abstract string Emit(TextWriter w); } public record Leaf(string Text) : AST {  public override string ToString() =>   $"\"{Text}\"";  public override string Emit(TextWriter w) =>  Text; } public record Node(char Op, AST[] Operands) : AST {  static int NodeID = 0;  public override string ToString() =>   $"{Op}({string.Join(", ", Operands.AsEnumerable())})";  public override string Emit(TextWriter w)  {  var names = Operands.Select(o => o.Emit(w)).ToList();  var id = ++NodeID;  var name = $"_{id:0000}";  switch (Operands.Length)  {  case 1:  w.WriteLine($"{name} = {Op}{names[0]}");  break;  case 2:  w.WriteLine($"{name} = {names[0]} {Op} {names[1]}");  break;  }  return name;  } } class ParserGenerator {  /*  E --> T {( "+" | "-" ) T}  T --> F {( "*" | "/" ) F}  F --> P ["^" F]  P --> v | "(" E ")" | "-" T"  */  readonly Dictionary<string, Expression<Func<Rule>>> grammar = new()  {  ["E"] = () => Seq(NonTerm("T"), While(Operator('+', '-'), Binary(NonTerm("T")))),  ["T"] = () => Seq(NonTerm("F"), While(Operator('*', '/'), Binary(NonTerm("F")))),  ["F"] = () => Seq(NonTerm("P"), If(Operator('^'), Binary(NonTerm("F")))),  ["P"] = () => Options(  Identifier(),  Seq('(', NonTerm("E"), ')'),  Seq(Operator('-'), Unary(NonTerm("T")))),  };  static void Main()  {  using var file = File.CreateText("RDParser.cs");  using IndentedTextWriter w = new(file);  ParserGenerator g = new();  g.Generate("E", w);  }  static void Demo()  {  EParser p = new();  var ast = p.Parse("(one + two) * three - four * five");  Console.WriteLine("Abstract syntax tree:");  Console.WriteLine(ast);  Console.WriteLine();  Console.WriteLine("Generated code:");  ast.Emit(Console.Out);  }  void Generate(string name, IndentedTextWriter w)  {  var rule = grammar[name];  w.WriteLine("using System.Text;");  w.WriteLine();  w.WriteLine($"public class {name}Parser");  Block(w, () =>  {  w.WriteLine("static void Expect(IEnumerator<char> c, char ch)");  Block(w, () =>  {  w.WriteLine("if (Consume(c) != ch)");  w.WriteLine(" throw new Exception();");  });  w.WriteLine();  w.WriteLine("static char Consume(IEnumerator<char> c)");  Block(w, () =>  {  w.WriteLine("var result = c.Current;");  w.WriteLine("_ = c.MoveNext();");  w.WriteLine("return result;");  });  w.WriteLine();  w.WriteLine("static Node MkNode(char op, params AST[] operands) =>");  w.WriteLine(" new(op, operands);");  w.WriteLine();  w.WriteLine("static Leaf MkLeaf(string text) =>");  w.WriteLine(" new(text);");  w.WriteLine();  w.WriteLine($"public AST Parse(string text)");  Block(w, () =>  {  w.WriteLine("const char End = '\\u0003';");  w.WriteLine("text = text.Replace(\" \", \"\") + End;");  w.WriteLine("var c = text.GetEnumerator();");  w.WriteLine("_ = c.MoveNext();");  w.WriteLine($"AST t = {name}(c);");  w.WriteLine("Expect(c, End);");  w.WriteLine("return t;");  });  foreach (var (n, r) in grammar)  {  w.WriteLine();  w.WriteLine($"AST {n}(IEnumerator<char> c)");  Block(w, () =>  {  NonTerminal(w, 't', r.Body);  w.WriteLine("return t;");  });  }  });  }  void NonTerminal(IndentedTextWriter w, char varName, Expression e, int index = 0)  {  if (e is UnaryExpression cast)  {  if (index == 0)  w.WriteLine("Consume(c);");  else  w.WriteLine($"Expect(c, '{cast.Operand}');");  return;  }  var call = e as MethodCallExpression;  var args = call!.Arguments!;  switch (call!.Method.Name)  {  case nameof(NonTerm):  w.WriteLine($"AST {varName} = {(args[0] as ConstantExpression)!.Value}(c);");  break;  case nameof(Seq):  foreach (var arg in ((NewArrayExpression)args[0]).Expressions)  {  NonTerminal(w, varName, arg, index++);  }  break;  case nameof(While):  w.WriteLine($"while ({Peek(args[0])})");  ++varName;  Block(w, () =>  {  NonTerminal(w, varName, args[1]);  });  --varName;  break;  case nameof(If):  w.WriteLine($"if ({Peek(args[0])})");  ++varName;  Block(w, () =>  {  NonTerminal(w, varName, args[1]);  });  --varName;  break;  case nameof(Binary):  w.WriteLine("var op = Consume(c);");  var t = (char)(varName - 1);  NonTerminal(w, varName, args[0]);  w.WriteLine($"{t} = MkNode(op, {t}, {varName});");  break;  case nameof(Unary):  w.WriteLine("var op = Consume(c);");  NonTerminal(w, varName, args[0]);  w.WriteLine($"{varName} = MkNode(op, {varName});");  break;  case nameof(Options):  var el = "";  w.WriteLine($"AST {varName};");  t = varName;  ++varName;  foreach (var arg in ((NewArrayExpression)args[0]).Expressions)  {  w.WriteLine($"{el}if ({Peek(arg)})");  Block(w, () =>  {  NonTerminal(w, varName, arg, index);  w.WriteLine($"{t} = {varName};");  });  el = "else ";  }  --varName;  w.WriteLine("else");  w.WriteLine(" throw new Exception();");  break;  case nameof(Operator):  break;  case nameof(Identifier):  w.WriteLine($"StringBuilder {varName}_ = new();");  w.WriteLine("while (char.IsLetter(c.Current))");  Block(w, () =>  {  w.WriteLine($"{varName}_.Append(Consume(c));");  });  w.WriteLine($"AST {varName} = MkLeaf({varName}_.ToString());");  break;  default:  throw new NotImplementedException();  }  }  string Peek(Expression e)  {  if (e is UnaryExpression cast)  {  return $"c.Current == '{cast.Operand}'";  }  var call = e as MethodCallExpression;  var args = call!.Arguments;  return call!.Method.Name switch  {  nameof(Seq) =>  Peek(((NewArrayExpression)args[0]).Expressions[0]),  nameof(Options) =>  string.Join(" || ", ((NewArrayExpression)args[0]).Expressions.Select(arg => "(" + Peek(arg) + ")")),  nameof(Operator) =>  string.Join(" || ", ((NewArrayExpression)args[0]).Expressions.Select(arg => $"c.Current == '{arg}'")),  nameof(Binary) or nameof(Unary) =>  $"c.Current == '{args[0]}'",  nameof(Identifier) =>  "char.IsLetter(c.Current)",  _ => throw new NotImplementedException(),  };  }  static void Block(IndentedTextWriter w, Action action)  {  w.WriteLine("{");  w.Indent++;  action();  w.Indent--;  w.WriteLine("}");  }  static Rule NonTerm(string name) => new();  static Rule Operator(params char[] ops) => new();  static Rule Seq(params Rule[] rules) => new();  static Rule Options(params Rule[] rules) => new();  static Rule If(Rule cond, Rule r) => new();  static Rule While(Rule cond, Rule r) => new();  static Rule Identifier() => new();  static Rule Binary(Rule r) => new();  static Rule Unary(Rule r) => new(); } record Rule() {  public static implicit operator Rule(char ch) => new(); } 
Output:

Running Main from the code above produces the following recursive-descent parser:

using System.Text; public class EParser {  static void Expect(IEnumerator<char> c, char ch)  {  if (Consume(c) != ch)  throw new Exception();  }    static char Consume(IEnumerator<char> c)  {  var result = c.Current;  _ = c.MoveNext();  return result;  }    static Node MkNode(char op, params AST[] operands) =>  new(op, operands);    static Leaf MkLeaf(string text) =>  new(text);    public AST Parse(string text)  {  const char End = '\u0003';  text = text.Replace(" ", "") + End;  var c = text.GetEnumerator();  _ = c.MoveNext();  AST t = E(c);  Expect(c, End);  return t;  }    AST E(IEnumerator<char> c)  {  AST t = T(c);  while (c.Current == '+' || c.Current == '-')  {  var op = Consume(c);  AST u = T(c);  t = MkNode(op, t, u);  }  return t;  }    AST T(IEnumerator<char> c)  {  AST t = F(c);  while (c.Current == '*' || c.Current == '/')  {  var op = Consume(c);  AST u = F(c);  t = MkNode(op, t, u);  }  return t;  }    AST F(IEnumerator<char> c)  {  AST t = P(c);  if (c.Current == '^')  {  var op = Consume(c);  AST u = F(c);  t = MkNode(op, t, u);  }  return t;  }    AST P(IEnumerator<char> c)  {  AST t;  if (char.IsLetter(c.Current))  {  StringBuilder u_ = new();  while (char.IsLetter(c.Current))  {  u_.Append(Consume(c));  }  AST u = MkLeaf(u_.ToString());  t = u;  }  else if (c.Current == '(')  {  Consume(c);  AST u = E(c);  Expect(c, ')');  t = u;  }  else if (c.Current == '-')  {  var op = Consume(c);  AST u = T(c);  u = MkNode(op, u);  t = u;  }  else  throw new Exception();  return t;  } } 

Running the Demo method calls the generated parser and produces this output:

Abstract syntax tree: -(*(+("one", "two"), "three"), *("four", "five")) Generated code: _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003 
Works with: C++11

This program translates an annotated LL(1) grammar into a C++ lexer plus parser. Each rule is required to return a string of some kind and the return values of the already matched nonterminals (and matched text of terminals) can be accessed with $1, $2, etc. which are replaced by the appropriate string variable.

It can't handle newlines as part of the grammar, the error checking is fairly limited and the error reporting is basically non-existent, but the parser it generates (not shown below) is human readable.

#include <fstream> #include <iostream> #include <map> #include <regex> #include <set> #include <sstream> #include <string> using namespace std; map<string, string> terminals; map<string, vector<vector<string>>> nonterminalRules; map<string, set<string>> nonterminalFirst; map<string, vector<string>> nonterminalCode; int main(int argc, char **argv) {  if (argc < 3) {  cout << "Usage: <input file> <output file>" << endl;  return 1;  }  ifstream inFile(argv[1]);  ofstream outFile(argv[2]);  regex blankLine(R"(^\s*$)");  regex terminalPattern(R"((\w+)\s+(.+))");  regex rulePattern(R"(^!!\s*(\w+)\s*->\s*((?:\w+\s*)*)$)");  regex argPattern(R"(\$(\d+))");  smatch results;  // Read terminal patterns  string line;  while (true) {  getline(inFile, line);  // Terminals section ends with a blank line  if (regex_match(line, blankLine))  break;  regex_match(line, results, terminalPattern);  terminals[results[1]] = results[2];  }  outFile << "#include <iostream>" << endl  << "#include <fstream>" << endl  << "#include <string>" << endl  << "#include <regex>" << endl  << "using namespace std;" << endl  << endl;  // Generate the token processing functions  outFile << "string input, nextToken, nextTokenValue;" << endl  << "string prevToken, prevTokenValue;" << endl  << endl  << "void advanceToken() {" << endl  << "	static smatch results;" << endl  << endl  << "	prevToken = nextToken;" << endl  << "	prevTokenValue = nextTokenValue;" << endl  << endl;  for (auto i = terminals.begin(); i != terminals.end(); ++i) {  string name = i->first + "_pattern";  string pattern = i->second;  outFile << "	static regex " << name << "(R\"(^\\s*(" << pattern << "))\");" << endl  << "	if (regex_search(input, results, " << name << ", regex_constants::match_continuous)) {" << endl  << "	nextToken = \"" << i->first << "\";" << endl  << "	nextTokenValue = results[1];" << endl  << "	input = regex_replace(input, " << name << ", \"\");" << endl  << "	return;" << endl  << "	}" << endl  << endl;  }  outFile << "	static regex eof(R\"(\\s*)\");" << endl  << "	if (regex_match(input, results, eof, regex_constants::match_continuous)) {" << endl  << "	nextToken = \"\";" << endl  << "	nextTokenValue = \"\";" << endl  << "	return;" << endl  << "	}" << endl  << endl  << "	throw \"Unknown token\";" << endl  << "}" << endl  << endl  << "bool same(string symbol) {" << endl  << "	if (symbol == nextToken) {" << endl  << "	advanceToken();" << endl  << "	return true;" << endl  << "	}" << endl  << "	return false;" << endl  << "}" << endl  << endl;  // Copy the header code to the output  while (true) {  getline(inFile, line);  // Copy lines until we reach the first rule  if (regex_match(line, results, rulePattern))  break;  outFile << line << endl;  }  // Build the nonterminal table  while (true) {  // results already contains the last matched rule  string name = results[1];  stringstream ss(results[2]);  string tempString;  vector<string> tempVector;  while (ss >> tempString)  tempVector.push_back(tempString);  nonterminalRules[name].push_back(tempVector);  // Read code until another rule is found  string code = "";  while (true) {  getline(inFile, line);  if (!inFile || regex_match(line, results, rulePattern))  break;  // Replace $1 with results[1], etc.  line = regex_replace(line, argPattern, "results[$1]");  code += line + "\n";  }  nonterminalCode[name].push_back(code);  // Stop when we reach the end of the file  if (!inFile)  break;  }  // Generate the first sets, inefficiently  bool done = false;  while (!done)  for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) {  string name = i->first;  done = true;  if (nonterminalFirst.find(i->first) == nonterminalFirst.end())  nonterminalFirst[i->first] = set<string>();  for (int j = 0; j < i->second.size(); ++j) {  if (i->second[j].size() == 0)  nonterminalFirst[i->first].insert("");  else {  string first = i->second[j][0];  if (nonterminalFirst.find(first) != nonterminalFirst.end()) {  for (auto k = nonterminalFirst[first].begin(); k != nonterminalFirst[first].end(); ++k) {  if (nonterminalFirst[name].find(*k) == nonterminalFirst[name].end()) {  nonterminalFirst[name].insert(*k);  done = false;  }  }  } else if (nonterminalFirst[name].find(first) == nonterminalFirst[name].end()) {  nonterminalFirst[name].insert(first);  done = false;  }  }  }  }  // Generate function signatures for the nonterminals  for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) {  string name = i->first + "_rule";  outFile << "string " << name << "();" << endl;  }  outFile << endl;  // Generate the nonterminal functions  for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) {  string name = i->first + "_rule";  outFile << "string " << name << "() {" << endl  << "	vector<string> results;" << endl  << "	results.push_back(\"\");" << endl  << endl;  // Check if this rule can match an empty string  int epsilon = -1;  for (int j = 0; epsilon == -1 && j < i->second.size(); ++j)  if (i->second[j].size() == 0)  epsilon = j;  // Generate each production  for (int j = 0; j < i->second.size(); ++j) {  // Nothing to generate for an empty rule  if (j == epsilon)  continue;  string token = i->second[j][0];  if (terminals.find(token) != terminals.end())  outFile << "	if (nextToken == \"" << i->second[j][0] << "\") {" << endl;  else {  outFile << "	if (";  bool first = true;  for (auto k = nonterminalFirst[token].begin(); k != nonterminalFirst[token].end(); ++k, first = false) {  if (!first)  outFile << " || ";  outFile << "nextToken == \"" << (*k) << "\"";  }  outFile << ") {" << endl;  }  for (int k = 0; k < i->second[j].size(); ++k) {  if (terminals.find(i->second[j][k]) != terminals.end()) {  outFile << "	if(same(\"" << i->second[j][k] << "\"))" << endl  << "	results.push_back(prevTokenValue);" << endl  << "	else" << endl  << "	throw \"Syntax error - mismatched token\";" << endl;  } else  outFile << "	results.push_back(" << i->second[j][k] << "_rule());" << endl;  }  // Copy rule code to output  outFile << nonterminalCode[i->first][j];  outFile << "	}" << endl << endl;  }  if (epsilon == -1)  outFile << "	throw \"Syntax error - unmatched token\";" << endl;  else  outFile << nonterminalCode[i->first][epsilon];  outFile << "}" << endl << endl;  }  // Generate the main function  outFile << "int main(int argc, char **argv) {" << endl  << "	if(argc < 2) {" << endl  << "	cout << \"Usage: <input file>\" << endl;" << endl  << "	return 1;" << endl  << "	}" << endl  << endl  << "	ifstream file(argv[1]);" << endl  << "	string line;" << endl  << "	input = \"\";" << endl  << endl  << "	while(true) {" << endl  << "	getline(file, line);" << endl  << "	if(!file) break;" << endl  << "	input += line + \"\\n\";" << endl  << "	}" << endl  << endl  << "	advanceToken();" << endl  << endl  << "	start_rule();" << endl  << "}" << endl; } 

Example grammar:

plus	\+ minus	- times	\* div	/ open	\( close	\) num	[0-9]+ var	[a-z]+ string nextLabel() {	static string label = "0000";	for(int i = label.length() - 1; i >= 0; --i) {	if(label[i] == '9')	label[i] = '0';	else {	++label[i];	break;	}	}	return "_" + label; } !! start -> expr start2 if($2 == "")	return $1; else {	string label = nextLabel();	cout << label << " = " << $1 << " " << $2 << endl;	return label; } !! start2 -> plus start return "+ " + $2; !! start2 -> minus start return "- " + $2; !! start2 -> return ""; !! expr -> term expr2 if($2 == "")	return $1; else {	string label = nextLabel();	cout << label << " = " << $1 << " " << $2 << endl;	return label; } !! expr2 -> times expr return "* " + $2; !! expr2 -> div expr return "/ " + $2; !! expr2 -> return ""; !! term -> var return $1; !! term -> num return $1; !! term -> open start close return $2; 

Example input to parser (filename passed through command line):

(one + two) * three + four * five 

Output (to standard out):

_0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 + _0003 
Translation of: Phix
Const NULL As Any Ptr = 0 Type Token  tType As String * 10 ' "SYMBOL", "IDENT", "EOF"  tVal As String * 50 End Type Type ASTNode  tType As String * 10 ' "SYMBOL" o "IDENT"  tVal As String * 50  izda As ASTNode Ptr  dcha As ASTNode Ptr End Type Dim Shared src As String Dim Shared sdx As Integer Dim Shared currChar As String * 1 Dim Shared tok As Token Dim Shared nxt As Integer Declare Function primary() As ASTNode Ptr Function currentChar() As String  Return Iif(sdx > Len(src), "", Mid(src, sdx, 1)) End Function Sub skipSpaces()  Do  If sdx > Len(src) Then Exit Do  currChar = currentChar()  If Instr(" " + Chr(9) + Chr(13) + Chr(10), currChar) = 0 Then Exit Do  sdx += 1  Loop End Sub Sub nextToken()  skipSpaces()  If sdx > Len(src) Then  tok.tType = "EOF"  tok.tVal = ""  Return  End If    currChar = currentChar()    If Instr("()+-/*", currChar) > 0 Then  tok.tType = "SYMBOL"  tok.tVal = currChar  sdx += 1  Elseif (currChar >= "a" And currChar <= "z") Or (currChar >= "A" And currChar <= "Z") Then  Dim As Integer startPos = sdx  Do  sdx += 1  If sdx > Len(src) Then Exit Do  currChar = currentChar()  Loop While (currChar = "_") Or _  (currChar >= "a" And currChar <= "z") Or _  (currChar >= "A" And currChar <= "Z") Or _  (currChar >= "0" And currChar <= "9")  tok.tType = "IDENT"  tok.tVal = Mid(src, startPos, sdx - startPos)  Else  Print "Error: unexpected character '" & currChar & "'"  End 1  End If End Sub Function newNode(tType As String, tVal As String, izda As ASTNode Ptr = NULL, dcha As ASTNode Ptr = NULL) As ASTNode Ptr  Dim As ASTNode Ptr node = Allocate(Sizeof(ASTNode))  If node = NULL Then  Print "Error: memory allocation failed"  End 1  End If    node->tType = tType  node->tVal = tVal  node->izda = izda  node->dcha = dcha    Return node End Function Function mulExpr() As ASTNode Ptr  Dim As ASTNode Ptr res = primary()  Do  If tok.tType <> "SYMBOL" Or (tok.tVal <> "*" And tok.tVal <> "/") Then Exit Do  Dim As String opVal = tok.tVal  nextToken()  res = newNode("SYMBOL", opVal, res, primary())  Loop    Return res End Function Function sumExpr() As ASTNode Ptr  Dim As ASTNode Ptr res = mulExpr()  Do  If tok.tType <> "SYMBOL" Or (tok.tVal <> "+" And tok.tVal <> "-") Then Exit Do  Dim As String opVal = tok.tVal  nextToken()  res = newNode("SYMBOL", opVal, res, mulExpr())  Loop    Return res End Function Function primary() As ASTNode Ptr  Dim As ASTNode Ptr res  If tok.tType = "IDENT" Then  res = newNode(tok.tType, tok.tVal)  nextToken()  Elseif tok.tType = "SYMBOL" And tok.tVal = "(" Then  nextToken()  res = sumExpr()  If tok.tType <> "SYMBOL" Or tok.tVal <> ")" Then  Print "Error: ')' was expected"  End 1  End If  nextToken()  Else  Print "Error: invalid primary expression"  End 1  End If    Return res End Function Sub printAST(node As ASTNode Ptr, indent As Integer = 0)  If node = NULL Then  Print "Error: null AST node in printAST"  End 1  End If    Dim i As Integer  For i = 1 To indent  Print " ";  Next    If node->tType = "SYMBOL" Then  Print "{{`" & Trim(node->tType) & "`,"  For i = 1 To indent  Print " ";  Next  Print " `" & Trim(node->tVal) & "`},"    printAST(node->izda, indent + 1)  Print ","    printAST(node->dcha, indent + 1)    Print "}";  Elseif node->tType = "IDENT" Then  Print "{`" & Trim(node->tType) & "`,"  For i = 1 To indent  Print " ";  Next  Print " `" & Trim(node->tVal) & "`}";  Else  Print "Error: Unknown AST node type: " & node->tType  End 1  End If End Sub Function showAST(ast As ASTNode Ptr) As String  If ast = NULL Then  Print "Error: null AST node"  End 1  End If    If ast->tType = "SYMBOL" Then  Dim lhs As String = showAST(ast->izda)  Dim rhs As String = showAST(ast->dcha)  Dim nid As String  nid = "_" & Right("0000" & Str(nxt), 4)  Print nid & " = " & lhs & " " & Trim(ast->tVal) & " " & rhs  nxt += 1  Return nid  Elseif ast->tType = "IDENT" Then  Return Trim(ast->tVal)  Else  Print "Error: unknown AST node type: " & ast->tType  End 1  End If End Function Sub Parse(source As String)  src = source  sdx = 1  nxt = 1  nextToken()    Dim As ASTNode Ptr ast = sumExpr()    Print "Complete syntactic tree:"  printAST(ast)  Print  Print !"\nIntermediate assignments:"  Dim As String result = showAST(ast)  End Sub ' Main program Parse("(one + two) * three - four * five") Sleep 
Output:
Complete syntactic tree: {{`SYMBOL`, `-`}, {{`SYMBOL`, `*`}, {{`SYMBOL`, `+`}, {`IDENT`, `one`}, {`IDENT`, `two`}}, {`IDENT`, `three`}}, {{`SYMBOL`, `*`}, {`IDENT`, `four`}, {`IDENT`, `five`}}} Intermediate assignments: _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003

The standard library already contains a recursive descent parser for Go programs or expressions, written in Go itself, whose output is an abstract syntax tree (AST) representing such code.

I've therefore applied this parser to the expression designated by the task, which only involves binary expressions and identifiers, and written a separate routine to convert the resulting AST into three-address code.

package main import (  "fmt"  "go/ast"  "go/parser"  "log" ) func labelStr(label int) string {  return fmt.Sprintf("_%04d", label) } type binexp struct {  op, left, right string  kind, index int } func main() {  x := "(one + two) * three - four * five"  fmt.Println("Expression to parse: ", x)  f, err := parser.ParseExpr(x)  if err != nil {  log.Fatal(err)  }  fmt.Println("\nThe abstract syntax tree for this expression:")  ast.Print(nil, f)  fmt.Println("\nThe corresponding three-address code:")  var binexps []binexp  // Inspect nodes in depth-first order.  ast.Inspect(f, func(n ast.Node) bool {  switch x := n.(type) {  case *ast.BinaryExpr:  sx, ok1 := x.X.(*ast.Ident)  sy, ok2 := x.Y.(*ast.Ident)  op := x.Op.String()  if ok1 && ok2 {  binexps = append(binexps, binexp{op, sx.Name, sy.Name, 3, 0})  } else if !ok1 && ok2 {  binexps = append(binexps, binexp{op, "<addr>", sy.Name, 2, 0})  } else if ok1 && !ok2 {  binexps = append(binexps, binexp{op, sx.Name, "<addr>", 1, 0})  } else {  binexps = append(binexps, binexp{op, "<addr>", "<addr>", 0, 0})  }  }  return true  })  for i := 0; i < len(binexps); i++ {  binexps[i].index = i  }  label, last := 0, -1  var ops, args []binexp  var labels []string  for i, be := range binexps {  if be.kind == 0 {  ops = append(ops, be)  }  if be.kind != 3 {  continue  }  label++  ls := labelStr(label)  fmt.Printf(" %s = %s %s %s\n", ls, be.left, be.op, be.right)  for j := i - 1; j > last; j-- {  be2 := binexps[j]  if be2.kind == 2 {  label++  ls2 := labelStr(label)  fmt.Printf(" %s = %s %s %s\n", ls2, ls, be2.op, be2.right)  ls = ls2  be = be2  } else if be2.kind == 1 {  label++  ls2 := labelStr(label)  fmt.Printf(" %s = %s %s %s\n", ls2, be2.left, be2.op, ls)  ls = ls2  be = be2  }  }  args = append(args, be)  labels = append(labels, ls)  lea, leo := len(args), len(ops)  for lea >= 2 {  if i < len(binexps)-1 && args[lea-2].index <= ops[leo-1].index {  break  }  label++  ls2 := labelStr(label)  fmt.Printf(" %s = %s %s %s\n", ls2, labels[lea-2], ops[leo-1].op, labels[lea-1])  ops = ops[0 : leo-1]  args = args[0 : lea-1]  labels = labels[0 : lea-1]  lea--  leo--  args[lea-1] = be  labels[lea-1] = ls2  }  last = i  } } 
Output:
Expression to parse: (one + two) * three - four * five The abstract syntax tree for this expression: 0 *ast.BinaryExpr { 1 . X: *ast.BinaryExpr { 2 . . X: *ast.ParenExpr { 3 . . . Lparen: 1 4 . . . X: *ast.BinaryExpr { 5 . . . . X: *ast.Ident { 6 . . . . . NamePos: 2 7 . . . . . Name: "one" 8 . . . . . Obj: *ast.Object { 9 . . . . . . Kind: bad 10 . . . . . . Name: "" 11 . . . . . } 12 . . . . } 13 . . . . OpPos: 6 14 . . . . Op: + 15 . . . . Y: *ast.Ident { 16 . . . . . NamePos: 8 17 . . . . . Name: "two" 18 . . . . . Obj: *(obj @ 8) 19 . . . . } 20 . . . } 21 . . . Rparen: 11 22 . . } 23 . . OpPos: 13 24 . . Op: * 25 . . Y: *ast.Ident { 26 . . . NamePos: 15 27 . . . Name: "three" 28 . . . Obj: *(obj @ 8) 29 . . } 30 . } 31 . OpPos: 21 32 . Op: - 33 . Y: *ast.BinaryExpr { 34 . . X: *ast.Ident { 35 . . . NamePos: 23 36 . . . Name: "four" 37 . . . Obj: *(obj @ 8) 38 . . } 39 . . OpPos: 28 40 . . Op: * 41 . . Y: *ast.Ident { 42 . . . NamePos: 30 43 . . . Name: "five" 44 . . . Obj: *(obj @ 8) 45 . . } 46 . } 47 } The corresponding three-address code: _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003 

J

J's native recursive descent parser is adequate for this task, if we map names appropriately.

Implementation:

cocurrent 'base' inlocale=: 4 :0 L:0  x,'_',y,'_' ) parse=: 3 :0  sentence=. ;:y  opinds=. (;:'+*-')i.sentence  opfuns=. (;:'plus times minus') inlocale 'base'  scratch=. cocreate''  coinsert__scratch 'base'  names=. ~.sentence#~_1<:nc sentence  (names inlocale scratch)=: names  r=. do__scratch ;:inv opinds}((#sentence)#"0 opfuns),sentence  codestroy__scratch''  r ) term=: 1 :0  2 :('m''',m,'''expr n') ) expr=:1 :0 :  r=. genname''  emit r,'=:',x,m,y  r ) plus=: '+' expr times=: '*' term minus=: '-' expr N=: 10000 genname=: 3 :0  'z',}.":N=: N+1 ) emit=: smoutput 

Task example:

 parse '(one + two) * three - four * five' z0001=:four*five z0002=:one+two z0003=:z0002*three z0004=:z0003-z0001 z0004 

See also https://github.com/jsoftware/general_misc/blob/master/trace.ijs for a model of the underlying parser being employed here.

The Julia compiler's own parser is a recursive descent parser, and can be used directly here.

const one, two, three, four, five, six, seven, eight, nine = collect(1:9) function testparser(s)  cod = Meta.parse(s)  println(Meta.lower(Main, cod)) end testparser("(one + two) * three - four * five") 
Output:
$(Expr(:thunk, CodeInfo( @ none within `top-level scope' 1 ─ %1 = one + two │ %2 = %1 * three │ %3 = four * five │ %4 = %2 - %3 └── return %4 ))) 
#!/usr/bin/perl use strict; # https://rosettacode.org/wiki/Recursive_descent_parser_generator use warnings; use Path::Tiny; my $h = qr/\G\s*/; my $identifier = qr/$h([a-z]\w*)\b/i; my $literal = qr/$h(['"])(.+?)\1/s; my (%rules, %called, $usercode, %patches); my $filename = './generatedparser.pl'; sub node { bless [ @_[1..$#_] ], $_[0] } sub error { die "ERROR: ", s/\G\s*\K/<**ERROR @_**>/r, "\n" } sub want { /$h\Q$_[1]\E/gc ? shift : error "missing '$_[1]' " } sub addin { node $_[0] => ref $_[1] eq $_[0] ? @{$_[1]} : $_[1], pop } local $_ = do { local $/; @ARGV ? <> : <DATA> }; # the EBNF $usercode = s/^(#usercode.*)//ms ? $1 : "# no usercode included\n";; $patches{PATCHUSER} = $usercode . "#end usercode\n"; # grammar support code s/^\h*#.*\n//gm; # remove comment lines $patches{PATCHGRAMMAR} = s/^(?:\h*\n)+//r =~ s/\n(?:\h*\n)+\z//r; while( /$identifier\s*=/gc ) # the start of a rule  {  my $name = $1;  $rules{startsymbol} //= node RULE => $name;  my $tree = expr(0);  $rules{$name} = $rules{$name} ? addin ALT => $rules{$name}, $tree : $tree;  /$h[.;]/gc or error 'missing rule terminator, needs . or ;';  } /\G\s*\z/ or error "incomplete parse at ", substr $_, pos($_) // 0; %rules or error "no rules found "; delete @called{keys %rules}; %called and die "\nERROR: undefined rule(s) <@{[ keys %called]}>\n"; sub expr # precedence climbing parser for grammer rules  {  my $tree =  /$h(NAME)\b/gc ? node $1 : # internal name matcher  /$identifier/gc ? do { $called{$1}++; node RULE => $1 } :  /$literal/gc ? node LIT => $2 :  /$h<(\w+)>/gc ? node ACTION => $1 :  /$h\[/gc ? node OPTION => want expr(0), ']' :  /$h\{/gc ? node REPEAT => want expr(0), '}' :  /$h\(/gc ? want expr(0), ')' :  error 'Invalid expression';  $tree =  /\G\s+/gc ? $tree :  $_[0] <= 1 && /\G(?=[[<{('"a-z])/gci ? addin SEQ => $tree, expr(2) :  $_[0] <= 0 && /\G\|/gc ? addin ALT => $tree, expr(1) :  return $tree while 1;  } my $perlcode = "# generated code (put in Rule:: package)\n"; for my $rule ( sort keys %rules )  {  my $body = $rules{$rule}->code;  $perlcode .= "\nsub Rule::$rule\n\t{\n\t$body\n\t}\n";  } $perlcode =~ s/^(?:\h*\n)+(?=\h*\})//gm; $perlcode .= "\n# preceding code was generated for rules\n"; $patches{PATCHGENERATED} = $perlcode; sub ALT::code  {  my $all = join " or\n\t", map $_->code, @{ $_[0] };  "( # alt\n\t$all )"  } sub SEQ::code  {  my $all = join " and\n\t", map $_->code, @{ $_[0] };  "( # seq\n\t$all )"  } sub REPEAT::code { "do { # repeat\n\t1 while @{[ $_[0][0]->code ]} ; 1 }" } sub OPTION::code { "( # option\n\t@{[ $_[0][0]->code ]} or 1 )" } sub RULE::code { "Rule::$_[0][0]()" } sub ACTION::code { "( $_[0][0]() || 1 )" } sub NAME::code { "( /\\G\$whitespace(\\w+)/gc and push \@stack, \$1 )" } sub LIT::code { "( /\\G\$whitespace(\Q$_[0][0]\E)/gc and push \@stack, \$1 )" } $_ = <<'END'; ##################################### template for generated code #!/usr/bin/perl use strict; # https://rosettacode.org/wiki/Recursive_descent_parser_generator use warnings; # WARNING: this code is generated my @stack; my $whitespace = qr/\s*/; my $grammar = <<'GRAMMAR'; # make grammar rules available to usercode PATCHGRAMMAR GRAMMAR PATCHUSER PATCHGENERATED local $_ = shift // '(one + two) * three - four * five'; eval { begin() }; # eval because it is optional Rule::startsymbol(); eval { end() }; # eval because it is optional /\G\s*\z/ or die "ERROR: incomplete parse\n"; END s/(PATCH\w+)/$patches{$1}/g; # insert grammar variable stuff path( $filename )->spew( $_ ); chmod 0555, $filename; # readonly, executable exec 'perl', $filename, @ARGV or die "exec failed $!"; __DATA__ expr = term { plus term <gen3addr> } . term = factor { times factor <gen3addr> } . factor = primary [ '^' factor <gen3addr> ] . primary = '(' expr ')' <removeparens> | NAME . plus = "+" | "-" . times = "*" | "/" . #usercode -- separator for included code for actions my $temp = '0000'; sub begin { print "parsing: $_\n\n" } sub gen3addr  {  @stack >= 3 or die "not enough on stack";  my @three = splice @stack, -3, 3, my $t = '_' . ++$temp;  print "$t = @three\n";  } sub removeparens  {  @stack >= 3 or die "not enough on stack";  splice @stack, -3, 3, $stack[-2];  } 

Running the above with no arguments uses a default grammar that will solve the specified example. It produces the following perl script (and then runs it).

#!/usr/bin/perl use strict; # https://rosettacode.org/wiki/Recursive_descent_parser_generator use warnings; # WARNING: this code is generated my @stack; my $whitespace = qr/\s*/; my $grammar = <<'GRAMMAR'; # make grammar rules available to usercode expr = term { plus term <gen3addr> } . term = factor { times factor <gen3addr> } . factor = primary [ '^' factor <gen3addr> ] . primary = '(' expr ')' <removeparens> | NAME . plus = "+" | "-" . times = "*" | "/" . GRAMMAR #usercode -- separator for included code for actions my $temp = '0000'; sub begin { print "parsing: $_\n\n" } sub gen3addr  {  @stack >= 3 or die "not enough on stack";  my @three = splice @stack, -3, 3, my $t = '_' . ++$temp;  print "$t = @three\n";  } sub removeparens  {  @stack >= 3 or die "not enough on stack";  splice @stack, -3, 3, $stack[-2];  } #end usercode # generated code (put in Rule:: package) sub Rule::expr  {  ( # seq  Rule::term() and  do { # repeat  1 while ( # seq  Rule::plus() and  Rule::term() and  ( gen3addr() || 1 ) ) ; 1 } )  } sub Rule::factor  {  ( # seq  Rule::primary() and  ( # option  ( # seq  ( /\G$whitespace(\^)/gc and push @stack, $1 ) and  Rule::factor() and  ( gen3addr() || 1 ) ) or 1 ) )  } sub Rule::plus  {  ( # alt  ( /\G$whitespace(\+)/gc and push @stack, $1 ) or  ( /\G$whitespace(\-)/gc and push @stack, $1 ) )  } sub Rule::primary  {  ( # alt  ( # seq  ( /\G$whitespace(\()/gc and push @stack, $1 ) and  Rule::expr() and  ( /\G$whitespace(\))/gc and push @stack, $1 ) and  ( removeparens() || 1 ) ) or  ( /\G$whitespace(\w+)/gc and push @stack, $1 ) )  } sub Rule::startsymbol  {  Rule::expr()  } sub Rule::term  {  ( # seq  Rule::factor() and  do { # repeat  1 while ( # seq  Rule::times() and  Rule::factor() and  ( gen3addr() || 1 ) ) ; 1 } )  } sub Rule::times  {  ( # alt  ( /\G$whitespace(\*)/gc and push @stack, $1 ) or  ( /\G$whitespace(\/)/gc and push @stack, $1 ) )  } # preceding code was generated for rules local $_ = shift // '(one + two) * three - four * five'; eval { begin() }; # eval because it is optional Rule::startsymbol(); eval { end() }; # eval because it is optional /\G\s*\z/ or die "ERROR: incomplete parse\n"; 

The above script can also be run stand-alone and produces the following output.

Output:
parsing: (one + two) * three - four * five _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003 

Different grammars and input can be specified on the command line.

recursivedescentparsergenerator.pl arithexpr.y '2 * 3 + 4 * 5' 

and giving this file as "arithexpr.y"

# test arith expr expr = term { '+' term <fadd> | '-' term <fsub> } . term = factor { '*' factor <fmul> | '/' factor <fdiv> } . factor = '(' expr ')' <noparen> | NAME . #usercode sub noparen { splice @stack, -3, 3, $stack[-2]; } sub fadd { splice @stack, -3, 3, $stack[-3] + $stack[-1] } sub fsub { splice @stack, -3, 3, $stack[-3] - $stack[-1] } sub fmul { splice @stack, -3, 3, $stack[-3] * $stack[-1] } sub fdiv { splice @stack, -3, 3, $stack[-3] / $stack[-1] } sub begin { print "expr = $_\n" } sub end { print "answer = @{[pop @stack]}\n" } 

will produce the following

Output:
expr = 2 * 3 + 4 * 5 answer = 26 


Technically the task is asking for code which generates something like the following, so I suppose it would actually meet the spec if it began with constant src = """ and ended with """ puts(1,src)... (and like several/most other entries on this page, this does not use a formal grammer)

-- -- demo\rosetta\RecursiveDescentParser.exw -- ======================================= -- with javascript_semantics string src integer ch, sdx sequence tok procedure skip_spaces() while 1 do if sdx>length(src) then exit end if ch = src[sdx] if not find(ch,{' ','\t','\r','\n'}) then exit end if sdx += 1 end while end procedure procedure next_token() -- yeilds one of: -- {"SYMBOL",ch} where ch is one of "()+-/*", or -- {"IDENT",string}, or {"EOF"} skip_spaces() integer tokstart = sdx if sdx>length(src) then tok = {"EOF"} elsif find(ch,"()+-/*") then sdx += 1 tok = {"SYMBOL",ch&""} elsif (ch>='a' and ch<='z') or (ch>='A' and ch<='Z') then while true do sdx += 1 if sdx>length(src) then exit end if ch = src[sdx] if ch!='_' and (ch<'a' or ch>'z') and (ch<'A' or ch>'Z') and (ch<'0' or ch>'9') then exit end if end while tok = {"IDENT",src[tokstart..sdx-1]} else ?9/0 end if end procedure forward function sum_expr() function primary() sequence res if tok[1]="IDENT" then res = tok next_token() elsif tok={"SYMBOL","("} then next_token() res = sum_expr() if tok!={"SYMBOL",")"} then ?9/0 end if next_token() else ?9/0 end if return res end function function mul_expr() sequence res = primary() while true do if tok[1]!="SYMBOL" or not find(tok[2],{"*","/"}) then exit end if res = {tok,res,NULL} next_token() res[3] = primary() end while return res end function function sum_expr() sequence res = mul_expr() while true do if tok[1]!="SYMBOL" or not find(tok[2],{"+","-"}) then exit end if res = {tok,res,NULL} next_token() res[3] = mul_expr() end while return res end function integer nxt = 1 function show_ast(sequence ast) if ast[1][1]="SYMBOL" then string op = ast[1][2], lhs = show_ast(ast[2]), rhs = show_ast(ast[3]), nid = sprintf("_%04d",nxt) printf(1,"%s = %s %s %s\n",{nid,lhs,op,rhs}) nxt += 1 return nid elsif ast[1]="IDENT" then return ast[2] end if ?9/0 end function procedure parse(string source) src = source sdx = 1 next_token() sequence ast = sum_expr() if tok!={"EOF"} then ?9/0 end if pp(ast,{pp_Nest,4}) {} = show_ast(ast) end procedure parse("(one + two) * three - four * five") {} = wait_key() 
Output:
{{`SYMBOL`, `-`}, {{`SYMBOL`, `*`}, {{`SYMBOL`, `+`}, {`IDENT`, `one`}, {`IDENT`, `two`}}, {`IDENT`, `three`}}, {{`SYMBOL`, `*`}, {`IDENT`, `four`}, {`IDENT`, `five`}}} _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003 

(formerly Perl 6) Instead of writing a full-blown generator, it is possible to solve the task partly by making use of the built-in optimizer and study the relevant AST output

raku --target=optimize -e 'no strict; say ($one + $two) * $three - $four * $five' | tail -11 - QAST::Op(callstatic &say) <sunk> :statement_id<2> say ($one + $two) * $three - $four * $five - QAST::Op(callstatic &infix:<->) <wanted> - - QAST::Op(callstatic &infix:<*>) <wanted> * - QAST::Op(callstatic &infix:<+>) <wanted> :statement_id<3> + - QAST::Var(local __lowered_lex_5) <wanted> $one - QAST::Var(local __lowered_lex_4) <wanted> $two - QAST::Var(local __lowered_lex_3) <wanted> $three - QAST::Op(callstatic &infix:<*>) <wanted> * - QAST::Var(local __lowered_lex_2) <wanted> $four - QAST::Var(local __lowered_lex_1) <wanted> $five - QAST::WVal(Nil) 

As you can see by examining the nested tree, the calculations are done as follows (expressed using a postfix notation)

one two + three * four five * - 
Translation of: Phix
Library: Wren-str
Library: Wren-fmt
import "./str" for Char import "./fmt" for Fmt class RDP {   construct parse(source) {  _src = source  _sdx = 0  _ch = null  _tok = null  _nxt = 1  nextToken()  var ast = sumExpr()  if (_tok[0] != "EOF") Fiber.abort("Something went wrong.")  printAst(ast, 0)  System.print()  showAst(ast)  }  skipSpaces() {  while (true) {  if (_sdx >= _src.count) return  _ch = _src[_sdx]  if (!" \t\r\n".contains(_ch)) return  _sdx = _sdx + 1  }  }  // yields one of:  // ["SYMBOL", ch] where ch is one of "()+-/*", or  // ["IDENT", string] or ["EOF"]  nextToken() {  skipSpaces()  var tokStart = _sdx  if (_sdx >= _src.count) {  _tok = ["EOF"]  } else if ("()+-/*".contains(_ch)) {  _sdx = _sdx + 1  _tok = ["SYMBOL", _ch]  } else if (Char.isAsciiLetter(_ch)) {  while (true) {  _sdx = _sdx + 1  if (_sdx >= _src.count) break  _ch = _src[_sdx]  if (!Char.isAsciiAlphaNum(_ch) && _ch != "_") break  }  _tok = ["IDENT", _src[tokStart..._sdx]]  } else {  Fiber.abort("Invalid token '%(_ch)'.")  }  }  primary() {  var res = []  if (_tok[0] == "IDENT") {  res = _tok.toList  nextToken()  } else if (_tok[0] == "SYMBOL" && _tok[1] == "(") {  nextToken()  res = sumExpr()  if (_tok[0] != "SYMBOL" || _tok[1] != ")") Fiber.abort("Unexpected token '%(_tok)'.")  nextToken()  } else {  Fiber.abort("Unexpected token '%(_tok)'.")  }  return res  }  mulExpr() {  var res = primary()  while (true) {  if (_tok[0] != "SYMBOL" || !"*/".contains(_tok[1])) break  res = [_tok, res, null]  nextToken()  res[2] = primary()  }  return res  }  sumExpr() {  var res = mulExpr()  while (true) {  if (_tok[0] != "SYMBOL" || !"+-".contains(_tok[1])) break  res = [_tok, res, null]  nextToken()  res[2] = mulExpr()  }  return res  }  showAst(ast) {  if (ast[0][0] == "SYMBOL") {  var op = ast[0][1]  var lhs = showAst(ast[1])  var rhs = showAst(ast[2])  var thiz = Fmt.swrite("_$04d", _nxt)  Fmt.print("$s = $s $s $s", thiz, lhs, op, rhs)  _nxt = _nxt + 1  return thiz  } else if (ast[0] == "IDENT") {  return ast[1]  }  Fiber.abort("Something went wrong.")  }  printAst(ast, level) {  for (e in ast) {  var indent = " " * level  if (!(e is List)) {  System.print(indent + e)  } else {  System.print(indent + "{")  printAst(e, level+1)  System.print(indent + "}")   }  }  } } RDP.parse("(one + two) * three - four * five") 
Output:
{ SYMBOL - } { { SYMBOL * } { { SYMBOL + } { IDENT one } { IDENT two } } { IDENT three } } { { SYMBOL * } { IDENT four } { IDENT five } } _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003