Skip to content
This repository was archived by the owner on Jul 24, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions meta/dump
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env sh

# Runs the `php-parser-rs` binary through cargo, forwarding this script's
# first argument to it (presumably the path of the file to dump; confirm
# against the binary's argument handling).
#
# Usage: meta/dump <argument>

# -x: echo each command before running it; -e: abort on the first failure.
set -xe

# `${1+"$1"}` expands to the quoted first argument when one was supplied and to
# nothing at all otherwise, so values containing spaces or glob characters
# reach the binary as a single, unmodified argument.
cargo run --bin php-parser-rs -- ${1+"$1"}
5 changes: 5 additions & 0 deletions meta/snapshot
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env sh

# Runs the project's `snapshot` binary through cargo.
#
# Usage: meta/snapshot

# Abort on the first failing command, then start echoing commands as they run.
set -e
set -x

cargo run --bin snapshot
8 changes: 7 additions & 1 deletion src/lexer/byte_string.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::cmp::{Eq, PartialEq};
use std::fmt::{Debug, Display, Formatter, Result};
use std::ops::Deref;
use std::ops::{Deref, DerefMut};
use std::str::from_utf8;

/// A wrapper for Vec<u8> that provides a human-readable Debug impl and
Expand Down Expand Up @@ -98,6 +98,12 @@ impl Deref for ByteString {
}
}

/// Mutable counterpart to the `Deref` impl: lets callers mutate the wrapped
/// value directly through a `ByteString`.
impl DerefMut for ByteString {
    /// Returns a mutable reference to the wrapper's single field.
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
13 changes: 13 additions & 0 deletions src/lexer/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ pub enum SyntaxError {
InvalidOctalLiteral(Span),
InvalidUnicodeEscape(Span),
UnpredictableState(Span),
InvalidDocIndentation(Span),
InvalidDocBodyIndentationLevel(usize, Span),
}

impl Display for SyntaxError {
Expand Down Expand Up @@ -59,6 +61,17 @@ impl Display for SyntaxError {
"Syntax Error: Reached an unpredictable state on line {} column {}",
span.0, span.1
),
Self::InvalidDocIndentation(span) => write!(
f,
"Syntax Error: Invalid indentation - cannot use tabs and spaces on line {}",
span.0
),
Self::InvalidDocBodyIndentationLevel(expected, span) => write!(
f,
"Syntax Error: Invalid body indentation level - expecting an indentation level of at least {} on line {}",
expected,
span.0
),
}
}
}
222 changes: 204 additions & 18 deletions src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ use crate::lexer::token::TokenKind;
use crate::ident;
use crate::ident_start;

pub use self::state::DocStringKind;
use self::token::DocStringIndentationType;

#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
pub struct Lexer;

Expand Down Expand Up @@ -65,10 +68,12 @@ impl Lexer {
// The double quote state is entered when inside a double-quoted string that
// contains variables.
StackFrame::DoubleQuote => tokens.extend(self.double_quote(&mut state)?),
StackFrame::Heredoc(label) => {
// The doc string state is entered when tokenizing heredocs and nowdocs.
StackFrame::DocString(kind, label) => {
let kind = kind.clone();
let label = label.clone();

tokens.extend(self.heredoc(&mut state, label)?)
tokens.extend(self.docstring(&mut state, kind, label)?)
}
// LookingForProperty is entered inside double quotes,
// backticks, or a heredoc, expecting a variable name.
Expand Down Expand Up @@ -512,14 +517,38 @@ impl Lexer {
state.next();
TokenKind::Minus
}
[b'<', b'<', b'<', ident_start!(), ..] => {
[b'<', b'<', b'<', ..] => {
state.skip(3);

self.skip_whitespace(state);

let doc_string_kind = match state.peek_buf() {
[b'\'', ..] => {
state.next();
DocStringKind::Nowdoc
}
_ => DocStringKind::Heredoc,
};

// Nowdoc labels are wrapped in single quotes: the opening `'` was consumed
// above, and the closing `'` is checked right after the identifier.
let label: ByteString = match self.peek_identifier(state) {
Some(_) => self.consume_identifier(state).into(),
None => unreachable!(),
};

if doc_string_kind == DocStringKind::Nowdoc {
match state.current {
Some(b'\'') => state.next(),
_ => {
return Err(SyntaxError::UnexpectedCharacter(
state.current.unwrap(),
state.span,
))
}
};
}

if !matches!(state.peek_buf(), [b'\n', ..]) {
return Err(SyntaxError::UnexpectedCharacter(
state.current.unwrap(),
Expand All @@ -528,9 +557,9 @@ impl Lexer {
}

state.next();
state.set(StackFrame::Heredoc(label.clone()))?;
state.set(StackFrame::DocString(doc_string_kind, label.clone()))?;

TokenKind::StartHeredoc(label)
TokenKind::StartDocString(label, doc_string_kind)
}
[b'<', b'<', b'=', ..] => {
state.skip(3);
Expand Down Expand Up @@ -667,25 +696,57 @@ impl Lexer {
Ok(tokens)
}

fn heredoc(&self, state: &mut State, label: ByteString) -> SyntaxResult<Vec<Token>> {
fn docstring(
&self,
state: &mut State,
kind: DocStringKind,
label: ByteString,
) -> SyntaxResult<Vec<Token>> {
let span = state.span;
let mut buffer = Vec::new();
// FIXME: We need to track whitespace amount here. It's a bit painful, so skipping for now
// so we can find other things to fix first.
let mut new_line = false;

let mut indentation_type: Option<DocStringIndentationType> = None;
let mut indentation_amount: usize = 0;

// 1. Check if there's any whitespace here. It can either be a space or tab character.
if matches!(state.peek_buf(), [b' ' | b'\t', ..]) {
indentation_type = Some(DocStringIndentationType::from(state.current.unwrap()));
}

// 2. Count how much whitespace there is on this line.
if let Some(indentation_type) = indentation_type {
loop {
match (indentation_type, state.peek_buf()) {
(DocStringIndentationType::Space, [b' ', ..]) => {
indentation_amount += 1;
state.next();
buffer.push(b' ');
}
(DocStringIndentationType::Tab, [b'\t', ..]) => {
indentation_amount += 1;
state.next();
buffer.push(b'\t');
}
_ => break,
};
}
}

let kind = loop {
match state.peek_buf() {
[b'$', b'{', ..] => {
[b'$', b'{', ..] if kind == DocStringKind::Heredoc => {
state.skip(2);
state.enter(StackFrame::LookingForVarname);
break TokenKind::DollarLeftBrace;
}
[b'{', b'$', ..] => {
[b'{', b'$', ..] if kind == DocStringKind::Heredoc => {
// Intentionally only consume the left brace.
state.next();
state.enter(StackFrame::Scripting);
break TokenKind::LeftBrace;
}
[b'$', ident_start!(), ..] => {
[b'$', ident_start!(), ..] if kind == DocStringKind::Heredoc => {
state.next();
let ident = self.consume_identifier(state);

Expand All @@ -700,23 +761,148 @@ impl Lexer {

break TokenKind::Variable(ident.into());
}
&[b'\n', ..] => {
new_line = true;
state.next();
buffer.push(b'\n');
}
&[b, ..] => {
// FIXME: Hacky.
// If the last character we parsed was a line break, we'll know we're at the start of a new line
// where the closing heredoc label might be found.
if matches!(buffer.last(), Some(b'\n')) && state.try_read(&label.0) {
// If we're not on a new line, just add to the buffer as usual.
if !new_line {
new_line = false;
state.next();
buffer.push(b);
continue;
}

// If we can see the label here, we can consume it and exit early.
if state.try_read(&label) {
state.skip(label.len());
state.set(StackFrame::Scripting)?;
break TokenKind::EndHeredoc(label);
break TokenKind::EndDocString(label, None, 0);
}

state.next();
buffer.push(b);
// We know the label isn't at the start of the line, so we can
// check if the line starts with any whitespace.
let line_starts_with_whitespace =
matches!(state.peek_buf(), [b' ' | b'\t', ..]);
let mut current_indentation_amount = 0;

// If the line does start with whitespace, let's figure out what the current
// indentation type is and how much whitespace there is.
if line_starts_with_whitespace {
let current_indentation_type;

match state.peek_buf() {
[b' ', ..] => {
current_indentation_type = DocStringIndentationType::Space;
}
[b'\t', ..] => {
current_indentation_type = DocStringIndentationType::Tab;
}
_ => unreachable!(),
};

// If there was indentation on a previous line, we need to check
// if the current indentation type is the same or different.
// If it's different, we need to produce an error.
if let Some(indentation_type) = indentation_type {
if indentation_type != current_indentation_type {
return Err(SyntaxError::InvalidDocIndentation(state.span));
}
}

let mut leading_whitespace_buffer = Vec::new();

// If the type of whitespace is the same, we want to know
// how much whitespace is on this line. We only care about
// the smallest amount of whitespace in this case.
loop {
match (current_indentation_type, state.peek_buf()) {
(DocStringIndentationType::Space, [b' ', ..]) => {
leading_whitespace_buffer.push(b' ');
current_indentation_amount += 1;
state.next();
}
(DocStringIndentationType::Tab, [b'\t', ..]) => {
leading_whitespace_buffer.push(b'\t');
current_indentation_amount += 1;
state.next();
}
_ => break,
};
}

// If we can read the label at this point, we then need to check if the amount
// of indentation is the same or less than the smallest amount encountered thus far.
if state.try_read(&label) && current_indentation_amount > indentation_amount
{
return Err(SyntaxError::InvalidDocBodyIndentationLevel(
current_indentation_amount,
state.span,
));
}

// If we've found less whitespace here, we should update the minimum.
if current_indentation_amount < indentation_amount {
indentation_amount = current_indentation_amount;
}

let mut whitespace_buffer = Vec::new();

// We should now try to consume any more whitespace, since the doc body
// can include spaces or tabs. We should also push it to the buffer,
// in case we don't encounter the label. In theory, the only whitespace
// we'll encounter here is the character not found by the checks above.
loop {
match state.peek_buf() {
[b @ b' ' | b @ b'\t', ..] => {
whitespace_buffer.push(b.clone());
state.next();
}
_ => break,
}
}

// Check if we can read the label again now.
if state.try_read(&label) {
// If there was extra whitespace after indentation, we need
// to error out about mixed indentation types.
if !whitespace_buffer.is_empty() {
return Err(SyntaxError::InvalidDocIndentation(state.span));
}

// If no extra whitespace was found, we've reached the end of the heredoc
// and can consume the label, sending the indentation amount along to the parser
// to normalize.
state.skip(label.len());
state.set(StackFrame::Scripting)?;
break TokenKind::EndDocString(
label,
indentation_type,
current_indentation_amount,
);
} else {
buffer.extend(leading_whitespace_buffer);
buffer.extend(whitespace_buffer);
continue;
}
} else {
new_line = false;
state.next();
buffer.push(b);
}
}
[] => return Err(SyntaxError::UnexpectedEndOfFile(state.span)),
}
};

// Trailing line breaks in the last segment of a heredoc
// shouldn't end up in the final string.
if buffer.last() == Some(&b'\n') {
buffer.pop();
}

let mut tokens = Vec::new();
if !buffer.is_empty() {
tokens.push(Token {
Expand Down
12 changes: 11 additions & 1 deletion src/lexer/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,19 @@ use crate::lexer::error::SyntaxResult;
use crate::lexer::token::Span;
use crate::prelude::ByteString;

/// The flavour of doc string being tokenized.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd)]
pub enum DocStringKind {
    /// Doc string whose opening label is written without quotes.
    Heredoc,
    /// Doc string whose opening label is wrapped in single quotes.
    Nowdoc,
}

#[derive(Debug, PartialEq, Eq, Clone)]
pub enum StackFrame {
Initial,
Scripting,
Halted,
DoubleQuote,
Heredoc(ByteString),
DocString(DocStringKind, ByteString),
LookingForVarname,
LookingForProperty,
VarOffset,
Expand Down Expand Up @@ -71,6 +77,10 @@ impl State {
self.chars.get(self.cursor + delta).copied()
}

/// Returns up to `len` bytes starting at the cursor, without moving the cursor.
///
/// The range is clamped to the end of the input, so the returned slice is
/// shorter than `len` (possibly empty) when fewer bytes remain. This avoids
/// panicking on an out-of-range slice index when peeking near the end of input.
pub fn peek_len(&self, len: usize) -> &[u8] {
    let total = self.chars.len();
    let start = self.cursor.min(total);
    // `saturating_add` guards against overflow for very large `len` values.
    let end = start.saturating_add(len).min(total);
    &self.chars[start..end]
}

/// Reports whether the bytes returned by `peek_buf` begin with `search`.
///
/// Takes `&self`, so the lexer state is left untouched.
pub fn try_read(&self, search: &[u8]) -> bool {
    let upcoming = self.peek_buf();
    upcoming.starts_with(search)
}
Expand Down
Loading