Skip to content
This repository was archived by the owner on Jul 24, 2024. It is now read-only.

Commit eb04872

Browse files
lexer&parser: full heredoc/nowdoc support (#154)
1 parent b6c145f commit eb04872

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+907
-34
lines changed

meta/dump

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env sh
2+
3+
set -xe
4+
5+
# Run the php-parser-rs binary on the file given as the first argument.
# "$1" is quoted so paths containing spaces or glob characters are passed
# to the binary as a single, unmodified argument.
cargo run --bin php-parser-rs -- "$1"

meta/snapshot

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env sh
2+
3+
set -xe
4+
5+
# Build and run the `snapshot` binary target through cargo.
cargo run --bin snapshot

src/lexer/byte_string.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::cmp::{Eq, PartialEq};
22
use std::fmt::{Debug, Display, Formatter, Result};
3-
use std::ops::Deref;
3+
use std::ops::{Deref, DerefMut};
44
use std::str::from_utf8;
55

66
/// A wrapper for Vec<u8> that provides a human-readable Debug impl and
@@ -98,6 +98,12 @@ impl Deref for ByteString {
9898
}
9999
}
100100

101+
impl DerefMut for ByteString {
102+
fn deref_mut(&mut self) -> &mut Self::Target {
103+
&mut self.0
104+
}
105+
}
106+
101107
#[cfg(test)]
102108
mod tests {
103109
use super::*;

src/lexer/error.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ pub enum SyntaxError {
1414
InvalidOctalLiteral(Span),
1515
InvalidUnicodeEscape(Span),
1616
UnpredictableState(Span),
17+
InvalidDocIndentation(Span),
18+
InvalidDocBodyIndentationLevel(usize, Span),
1719
}
1820

1921
impl Display for SyntaxError {
@@ -59,6 +61,17 @@ impl Display for SyntaxError {
5961
"Syntax Error: Reached an unpredictable state on line {} column {}",
6062
span.0, span.1
6163
),
64+
Self::InvalidDocIndentation(span) => write!(
65+
f,
66+
"Syntax Error: Invalid indentation - cannot use tabs and spaces on line {}",
67+
span.0
68+
),
69+
Self::InvalidDocBodyIndentationLevel(expected, span) => write!(
70+
f,
71+
"Syntax Error: Invalid body indentation level - expecting an indentation level of at least {} on line {}",
72+
expected,
73+
span.0
74+
),
6275
}
6376
}
6477
}

src/lexer/mod.rs

Lines changed: 204 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ use crate::lexer::token::TokenKind;
2020
use crate::ident;
2121
use crate::ident_start;
2222

23+
pub use self::state::DocStringKind;
24+
use self::token::DocStringIndentationType;
25+
2326
#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
2427
pub struct Lexer;
2528

@@ -65,10 +68,12 @@ impl Lexer {
6568
// The double quote state is entered when inside a double-quoted string that
6669
// contains variables.
6770
StackFrame::DoubleQuote => tokens.extend(self.double_quote(&mut state)?),
68-
StackFrame::Heredoc(label) => {
71+
// The doc string state is entered when tokenizing heredocs and nowdocs.
72+
StackFrame::DocString(kind, label) => {
73+
let kind = kind.clone();
6974
let label = label.clone();
7075

71-
tokens.extend(self.heredoc(&mut state, label)?)
76+
tokens.extend(self.docstring(&mut state, kind, label)?)
7277
}
7378
// LookingForProperty is entered inside double quotes,
7479
// backticks, or a heredoc, expecting a variable name.
@@ -512,14 +517,38 @@ impl Lexer {
512517
state.next();
513518
TokenKind::Minus
514519
}
515-
[b'<', b'<', b'<', ident_start!(), ..] => {
520+
[b'<', b'<', b'<', ..] => {
516521
state.skip(3);
517522

523+
self.skip_whitespace(state);
524+
525+
let doc_string_kind = match state.peek_buf() {
526+
[b'\'', ..] => {
527+
state.next();
528+
DocStringKind::Nowdoc
529+
}
530+
_ => DocStringKind::Heredoc,
531+
};
532+
533+
// Nowdoc labels are wrapped in single quotes: the opening `'` is consumed
534+
// above, and the closing `'` is checked once the identifier has been read.
518535
let label: ByteString = match self.peek_identifier(state) {
519536
Some(_) => self.consume_identifier(state).into(),
520537
None => unreachable!(),
521538
};
522539

540+
if doc_string_kind == DocStringKind::Nowdoc {
541+
match state.current {
542+
Some(b'\'') => state.next(),
543+
_ => {
544+
return Err(SyntaxError::UnexpectedCharacter(
545+
state.current.unwrap(),
546+
state.span,
547+
))
548+
}
549+
};
550+
}
551+
523552
if !matches!(state.peek_buf(), [b'\n', ..]) {
524553
return Err(SyntaxError::UnexpectedCharacter(
525554
state.current.unwrap(),
@@ -528,9 +557,9 @@ impl Lexer {
528557
}
529558

530559
state.next();
531-
state.set(StackFrame::Heredoc(label.clone()))?;
560+
state.set(StackFrame::DocString(doc_string_kind, label.clone()))?;
532561

533-
TokenKind::StartHeredoc(label)
562+
TokenKind::StartDocString(label, doc_string_kind)
534563
}
535564
[b'<', b'<', b'=', ..] => {
536565
state.skip(3);
@@ -667,25 +696,57 @@ impl Lexer {
667696
Ok(tokens)
668697
}
669698

670-
fn heredoc(&self, state: &mut State, label: ByteString) -> SyntaxResult<Vec<Token>> {
699+
fn docstring(
700+
&self,
701+
state: &mut State,
702+
kind: DocStringKind,
703+
label: ByteString,
704+
) -> SyntaxResult<Vec<Token>> {
671705
let span = state.span;
672706
let mut buffer = Vec::new();
673-
// FIXME: We need to track whitespace amount here. It's a bit painful, so skipping for now
674-
// so we can find other things to fix first.
707+
let mut new_line = false;
708+
709+
let mut indentation_type: Option<DocStringIndentationType> = None;
710+
let mut indentation_amount: usize = 0;
711+
712+
// 1. Check if there's any whitespace here. It can be either a space or a tab character.
713+
if matches!(state.peek_buf(), [b' ' | b'\t', ..]) {
714+
indentation_type = Some(DocStringIndentationType::from(state.current.unwrap()));
715+
}
716+
717+
// 2. Count how much whitespace there is on this line.
718+
if let Some(indentation_type) = indentation_type {
719+
loop {
720+
match (indentation_type, state.peek_buf()) {
721+
(DocStringIndentationType::Space, [b' ', ..]) => {
722+
indentation_amount += 1;
723+
state.next();
724+
buffer.push(b' ');
725+
}
726+
(DocStringIndentationType::Tab, [b'\t', ..]) => {
727+
indentation_amount += 1;
728+
state.next();
729+
buffer.push(b'\t');
730+
}
731+
_ => break,
732+
};
733+
}
734+
}
735+
675736
let kind = loop {
676737
match state.peek_buf() {
677-
[b'$', b'{', ..] => {
738+
[b'$', b'{', ..] if kind == DocStringKind::Heredoc => {
678739
state.skip(2);
679740
state.enter(StackFrame::LookingForVarname);
680741
break TokenKind::DollarLeftBrace;
681742
}
682-
[b'{', b'$', ..] => {
743+
[b'{', b'$', ..] if kind == DocStringKind::Heredoc => {
683744
// Intentionally only consume the left brace.
684745
state.next();
685746
state.enter(StackFrame::Scripting);
686747
break TokenKind::LeftBrace;
687748
}
688-
[b'$', ident_start!(), ..] => {
749+
[b'$', ident_start!(), ..] if kind == DocStringKind::Heredoc => {
689750
state.next();
690751
let ident = self.consume_identifier(state);
691752

@@ -700,23 +761,148 @@ impl Lexer {
700761

701762
break TokenKind::Variable(ident.into());
702763
}
764+
&[b'\n', ..] => {
765+
new_line = true;
766+
state.next();
767+
buffer.push(b'\n');
768+
}
703769
&[b, ..] => {
704-
// FIXME: Hacky.
705-
// If the last character we parsed was a line break, we'll know we're at the start of a new line
706-
// where the closing heredoc label might be found.
707-
if matches!(buffer.last(), Some(b'\n')) && state.try_read(&label.0) {
770+
// If we're not on a new line, just add to the buffer as usual.
771+
if !new_line {
772+
new_line = false;
773+
state.next();
774+
buffer.push(b);
775+
continue;
776+
}
777+
778+
// If we can see the label here, we can consume it and exit early.
779+
if state.try_read(&label) {
708780
state.skip(label.len());
709781
state.set(StackFrame::Scripting)?;
710-
break TokenKind::EndHeredoc(label);
782+
break TokenKind::EndDocString(label, None, 0);
711783
}
712784

713-
state.next();
714-
buffer.push(b);
785+
// We know the label isn't at the start of the line, so we can
786+
// check if the line starts with any whitespace.
787+
let line_starts_with_whitespace =
788+
matches!(state.peek_buf(), [b' ' | b'\t', ..]);
789+
let mut current_indentation_amount = 0;
790+
791+
// If the line does start with whitespace, let's figure out what the current
792+
// indentation type is and how much whitespace there is.
793+
if line_starts_with_whitespace {
794+
let current_indentation_type;
795+
796+
match state.peek_buf() {
797+
[b' ', ..] => {
798+
current_indentation_type = DocStringIndentationType::Space;
799+
}
800+
[b'\t', ..] => {
801+
current_indentation_type = DocStringIndentationType::Tab;
802+
}
803+
_ => unreachable!(),
804+
};
805+
806+
// If there was indentation on a previous line, we need to check
807+
// if the current indentation type is the same or different.
808+
// If it's different, we need to produce an error.
809+
if let Some(indentation_type) = indentation_type {
810+
if indentation_type != current_indentation_type {
811+
return Err(SyntaxError::InvalidDocIndentation(state.span));
812+
}
813+
}
814+
815+
let mut leading_whitespace_buffer = Vec::new();
816+
817+
// If the type of whitespace is the same, we want to know
818+
// how much whitespace is on this line. We only care about
819+
// the smallest amount of whitespace in this case.
820+
loop {
821+
match (current_indentation_type, state.peek_buf()) {
822+
(DocStringIndentationType::Space, [b' ', ..]) => {
823+
leading_whitespace_buffer.push(b' ');
824+
current_indentation_amount += 1;
825+
state.next();
826+
}
827+
(DocStringIndentationType::Tab, [b'\t', ..]) => {
828+
leading_whitespace_buffer.push(b'\t');
829+
current_indentation_amount += 1;
830+
state.next();
831+
}
832+
_ => break,
833+
};
834+
}
835+
836+
// If we can read the label at this point, the closing label's indentation must not
837+
// exceed the smallest amount of indentation encountered thus far.
838+
if state.try_read(&label) && current_indentation_amount > indentation_amount
839+
{
840+
return Err(SyntaxError::InvalidDocBodyIndentationLevel(
841+
current_indentation_amount,
842+
state.span,
843+
));
844+
}
845+
846+
// If we've found less whitespace here, we should update the minimum.
847+
if current_indentation_amount < indentation_amount {
848+
indentation_amount = current_indentation_amount;
849+
}
850+
851+
let mut whitespace_buffer = Vec::new();
852+
853+
// We should now try to consume any more whitespace, since the doc body
854+
// can include spaces or tabs. We should also push it to the buffer,
855+
// in case we don't encounter the label. In theory, the only whitespace
856+
// we'll encounter here is the character not found by the checks above.
857+
loop {
858+
match state.peek_buf() {
859+
[b @ b' ' | b @ b'\t', ..] => {
860+
whitespace_buffer.push(b.clone());
861+
state.next();
862+
}
863+
_ => break,
864+
}
865+
}
866+
867+
// Check if we can read the label again now.
868+
if state.try_read(&label) {
869+
// If there was extra whitespace after indentation, we need
870+
// to error out about mixed indentation types.
871+
if !whitespace_buffer.is_empty() {
872+
return Err(SyntaxError::InvalidDocIndentation(state.span));
873+
}
874+
875+
// If no extra whitespace was found, we've reached the end of the heredoc
876+
// and can consume the label, sending the indentation amount along to the parser
877+
// to normalize.
878+
state.skip(label.len());
879+
state.set(StackFrame::Scripting)?;
880+
break TokenKind::EndDocString(
881+
label,
882+
indentation_type,
883+
current_indentation_amount,
884+
);
885+
} else {
886+
buffer.extend(leading_whitespace_buffer);
887+
buffer.extend(whitespace_buffer);
888+
continue;
889+
}
890+
} else {
891+
new_line = false;
892+
state.next();
893+
buffer.push(b);
894+
}
715895
}
716896
[] => return Err(SyntaxError::UnexpectedEndOfFile(state.span)),
717897
}
718898
};
719899

900+
// Trailing line breaks in the last segment of a heredoc
901+
// shouldn't end up in the final string.
902+
if buffer.last() == Some(&b'\n') {
903+
buffer.pop();
904+
}
905+
720906
let mut tokens = Vec::new();
721907
if !buffer.is_empty() {
722908
tokens.push(Token {

src/lexer/state.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,19 @@ use crate::lexer::error::SyntaxResult;
55
use crate::lexer::token::Span;
66
use crate::prelude::ByteString;
77

8+
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone, Copy)]
9+
pub enum DocStringKind {
10+
Heredoc,
11+
Nowdoc,
12+
}
13+
814
#[derive(Debug, PartialEq, Eq, Clone)]
915
pub enum StackFrame {
1016
Initial,
1117
Scripting,
1218
Halted,
1319
DoubleQuote,
14-
Heredoc(ByteString),
20+
DocString(DocStringKind, ByteString),
1521
LookingForVarname,
1622
LookingForProperty,
1723
VarOffset,
@@ -71,6 +77,10 @@ impl State {
7177
self.chars.get(self.cursor + delta).copied()
7278
}
7379

80+
pub fn peek_len(&self, len: usize) -> &[u8] {
81+
&self.chars[self.cursor..self.cursor + len]
82+
}
83+
7484
pub fn try_read(&self, search: &[u8]) -> bool {
7585
self.peek_buf().starts_with(search)
7686
}

0 commit comments

Comments
 (0)