@@ -20,6 +20,9 @@ use crate::lexer::token::TokenKind;
2020use crate :: ident;
2121use crate :: ident_start;
2222
23+ pub use self :: state:: DocStringKind ;
24+ use self :: token:: DocStringIndentationType ;
25+
2326#[ derive( Debug , PartialEq , Eq , Clone , Copy , Default ) ]
2427pub struct Lexer ;
2528
@@ -65,10 +68,12 @@ impl Lexer {
6568 // The double quote state is entered when inside a double-quoted string that
6669 // contains variables.
6770 StackFrame :: DoubleQuote => tokens. extend ( self . double_quote ( & mut state) ?) ,
68- StackFrame :: Heredoc ( label) => {
71+ // The doc string state is entered when tokenizing heredocs and nowdocs.
72+ StackFrame :: DocString ( kind, label) => {
73+ let kind = kind. clone ( ) ;
6974 let label = label. clone ( ) ;
7075
71- tokens. extend ( self . heredoc ( & mut state, label) ?)
76+ tokens. extend ( self . docstring ( & mut state, kind , label) ?)
7277 }
7378 // LookingForProperty is entered inside double quotes,
7479 // backticks, or a heredoc, expecting a variable name.
@@ -512,14 +517,38 @@ impl Lexer {
512517 state. next ( ) ;
513518 TokenKind :: Minus
514519 }
515- [ b'<' , b'<' , b'<' , ident_start ! ( ) , ..] => {
520+ [ b'<' , b'<' , b'<' , ..] => {
516521 state. skip ( 3 ) ;
517522
523+ self . skip_whitespace ( state) ;
524+
525+ let doc_string_kind = match state. peek_buf ( ) {
526+ [ b'\'' , ..] => {
527+ state. next ( ) ;
528+ DocStringKind :: Nowdoc
529+ }
530+ _ => DocStringKind :: Heredoc ,
531+ } ;
532+
533+ // Read the doc string label. For nowdocs, the opening `'` was consumed
534+ // above and the matching closing `'` is checked after the identifier.
518535 let label: ByteString = match self . peek_identifier ( state) {
519536 Some ( _) => self . consume_identifier ( state) . into ( ) ,
520537 None => unreachable ! ( ) ,
521538 } ;
522539
540+ if doc_string_kind == DocStringKind :: Nowdoc {
541+ match state. current {
542+ Some ( b'\'' ) => state. next ( ) ,
543+ _ => {
544+ return Err ( SyntaxError :: UnexpectedCharacter (
545+ state. current . unwrap ( ) ,
546+ state. span ,
547+ ) )
548+ }
549+ } ;
550+ }
551+
523552 if !matches ! ( state. peek_buf( ) , [ b'\n' , ..] ) {
524553 return Err ( SyntaxError :: UnexpectedCharacter (
525554 state. current . unwrap ( ) ,
@@ -528,9 +557,9 @@ impl Lexer {
528557 }
529558
530559 state. next ( ) ;
531- state. set ( StackFrame :: Heredoc ( label. clone ( ) ) ) ?;
560+ state. set ( StackFrame :: DocString ( doc_string_kind , label. clone ( ) ) ) ?;
532561
533- TokenKind :: StartHeredoc ( label)
562+ TokenKind :: StartDocString ( label, doc_string_kind )
534563 }
535564 [ b'<' , b'<' , b'=' , ..] => {
536565 state. skip ( 3 ) ;
@@ -667,25 +696,57 @@ impl Lexer {
667696 Ok ( tokens)
668697 }
669698
670- fn heredoc ( & self , state : & mut State , label : ByteString ) -> SyntaxResult < Vec < Token > > {
699+ fn docstring (
700+ & self ,
701+ state : & mut State ,
702+ kind : DocStringKind ,
703+ label : ByteString ,
704+ ) -> SyntaxResult < Vec < Token > > {
671705 let span = state. span ;
672706 let mut buffer = Vec :: new ( ) ;
673- // FIXME: We need to track whitespace amount here. It's a bit painful, so skipping for now
674- // so we can find other things to fix first.
707+ let mut new_line = false ;
708+
709+ let mut indentation_type: Option < DocStringIndentationType > = None ;
710+ let mut indentation_amount: usize = 0 ;
711+
712+ // 1. Check if there's any whitespace here. It can either be a space or tab character.
713+ if matches ! ( state. peek_buf( ) , [ b' ' | b'\t' , ..] ) {
714+ indentation_type = Some ( DocStringIndentationType :: from ( state. current . unwrap ( ) ) ) ;
715+ }
716+
717+ // 2. Count how much whitespace there is on this line.
718+ if let Some ( indentation_type) = indentation_type {
719+ loop {
720+ match ( indentation_type, state. peek_buf ( ) ) {
721+ ( DocStringIndentationType :: Space , [ b' ' , ..] ) => {
722+ indentation_amount += 1 ;
723+ state. next ( ) ;
724+ buffer. push ( b' ' ) ;
725+ }
726+ ( DocStringIndentationType :: Tab , [ b'\t' , ..] ) => {
727+ indentation_amount += 1 ;
728+ state. next ( ) ;
729+ buffer. push ( b'\t' ) ;
730+ }
731+ _ => break ,
732+ } ;
733+ }
734+ }
735+
675736 let kind = loop {
676737 match state. peek_buf ( ) {
677- [ b'$' , b'{' , ..] => {
738+ [ b'$' , b'{' , ..] if kind == DocStringKind :: Heredoc => {
678739 state. skip ( 2 ) ;
679740 state. enter ( StackFrame :: LookingForVarname ) ;
680741 break TokenKind :: DollarLeftBrace ;
681742 }
682- [ b'{' , b'$' , ..] => {
743+ [ b'{' , b'$' , ..] if kind == DocStringKind :: Heredoc => {
683744 // Intentionally only consume the left brace.
684745 state. next ( ) ;
685746 state. enter ( StackFrame :: Scripting ) ;
686747 break TokenKind :: LeftBrace ;
687748 }
688- [ b'$' , ident_start ! ( ) , ..] => {
749+ [ b'$' , ident_start ! ( ) , ..] if kind == DocStringKind :: Heredoc => {
689750 state. next ( ) ;
690751 let ident = self . consume_identifier ( state) ;
691752
@@ -700,23 +761,148 @@ impl Lexer {
700761
701762 break TokenKind :: Variable ( ident. into ( ) ) ;
702763 }
764+ & [ b'\n' , ..] => {
765+ new_line = true ;
766+ state. next ( ) ;
767+ buffer. push ( b'\n' ) ;
768+ }
703769 & [ b, ..] => {
704- // FIXME: Hacky.
705- // If the last character we parsed was a line break, we'll know we're at the start of a new line
706- // where the closing heredoc label might be found.
707- if matches ! ( buffer. last( ) , Some ( b'\n' ) ) && state. try_read ( & label. 0 ) {
770+ // If we're not on a new line, just add to the buffer as usual.
771+ if !new_line {
772+ new_line = false ;
773+ state. next ( ) ;
774+ buffer. push ( b) ;
775+ continue ;
776+ }
777+
778+ // If we can see the label here, we can consume it and exit early.
779+ if state. try_read ( & label) {
708780 state. skip ( label. len ( ) ) ;
709781 state. set ( StackFrame :: Scripting ) ?;
710- break TokenKind :: EndHeredoc ( label) ;
782+ break TokenKind :: EndDocString ( label, None , 0 ) ;
711783 }
712784
713- state. next ( ) ;
714- buffer. push ( b) ;
785+ // We know the label isn't at the start of the line, so we can
786+ // check if the line starts with any whitespace.
787+ let line_starts_with_whitespace =
788+ matches ! ( state. peek_buf( ) , [ b' ' | b'\t' , ..] ) ;
789+ let mut current_indentation_amount = 0 ;
790+
791+ // If the line does start with whitespace, let's figure out what the current
792+ // indentation type is and how much whitespace there is.
793+ if line_starts_with_whitespace {
794+ let current_indentation_type;
795+
796+ match state. peek_buf ( ) {
797+ [ b' ' , ..] => {
798+ current_indentation_type = DocStringIndentationType :: Space ;
799+ }
800+ [ b'\t' , ..] => {
801+ current_indentation_type = DocStringIndentationType :: Tab ;
802+ }
803+ _ => unreachable ! ( ) ,
804+ } ;
805+
806+ // If there was indentation on a previous line, we need to check
807+ // if the current indentation type is the same or different.
808+ // If it's different, we need to produce an error.
809+ if let Some ( indentation_type) = indentation_type {
810+ if indentation_type != current_indentation_type {
811+ return Err ( SyntaxError :: InvalidDocIndentation ( state. span ) ) ;
812+ }
813+ }
814+
815+ let mut leading_whitespace_buffer = Vec :: new ( ) ;
816+
817+ // If the type of whitespace is the same, we want to know
818+ // how much whitespace is on this line. We only care about
819+ // the smallest amount of whitespace in this case.
820+ loop {
821+ match ( current_indentation_type, state. peek_buf ( ) ) {
822+ ( DocStringIndentationType :: Space , [ b' ' , ..] ) => {
823+ leading_whitespace_buffer. push ( b' ' ) ;
824+ current_indentation_amount += 1 ;
825+ state. next ( ) ;
826+ }
827+ ( DocStringIndentationType :: Tab , [ b'\t' , ..] ) => {
828+ leading_whitespace_buffer. push ( b'\t' ) ;
829+ current_indentation_amount += 1 ;
830+ state. next ( ) ;
831+ }
832+ _ => break ,
833+ } ;
834+ }
835+
836+ // If we can read the label at this point, we then need to check if the amount
837+ // of indentation is the same or less than the smallest amount encountered thus far.
838+ if state. try_read ( & label) && current_indentation_amount > indentation_amount
839+ {
840+ return Err ( SyntaxError :: InvalidDocBodyIndentationLevel (
841+ current_indentation_amount,
842+ state. span ,
843+ ) ) ;
844+ }
845+
846+ // If we've found less whitespace here, we should update the minimum.
847+ if current_indentation_amount < indentation_amount {
848+ indentation_amount = current_indentation_amount;
849+ }
850+
851+ let mut whitespace_buffer = Vec :: new ( ) ;
852+
853+ // We should now try to consume any more whitespace, since the doc body
854+ // can include spaces or tabs. We should also push it to the buffer,
855+ // in case we don't encounter the label. In theory, the only whitespace
856+ // we'll encounter here is the character not found by the checks above.
857+ loop {
858+ match state. peek_buf ( ) {
859+ [ b @ b' ' | b @ b'\t' , ..] => {
860+ whitespace_buffer. push ( b. clone ( ) ) ;
861+ state. next ( ) ;
862+ }
863+ _ => break ,
864+ }
865+ }
866+
867+ // Check if we can read the label again now.
868+ if state. try_read ( & label) {
869+ // If there was extra whitespace after indentation, we need
870+ // to error out about mixed indentation types.
871+ if !whitespace_buffer. is_empty ( ) {
872+ return Err ( SyntaxError :: InvalidDocIndentation ( state. span ) ) ;
873+ }
874+
875+ // If no extra whitespace was found, we've reached the end of the heredoc
876+ // and can consume the label, sending the indentation amount along to the parser
877+ // to normalize.
878+ state. skip ( label. len ( ) ) ;
879+ state. set ( StackFrame :: Scripting ) ?;
880+ break TokenKind :: EndDocString (
881+ label,
882+ indentation_type,
883+ current_indentation_amount,
884+ ) ;
885+ } else {
886+ buffer. extend ( leading_whitespace_buffer) ;
887+ buffer. extend ( whitespace_buffer) ;
888+ continue ;
889+ }
890+ } else {
891+ new_line = false ;
892+ state. next ( ) ;
893+ buffer. push ( b) ;
894+ }
715895 }
716896 [ ] => return Err ( SyntaxError :: UnexpectedEndOfFile ( state. span ) ) ,
717897 }
718898 } ;
719899
900+ // Trailing line breaks in the last segment of a heredoc
901+ // shouldn't end up in the final string.
902+ if buffer. last ( ) == Some ( & b'\n' ) {
903+ buffer. pop ( ) ;
904+ }
905+
720906 let mut tokens = Vec :: new ( ) ;
721907 if !buffer. is_empty ( ) {
722908 tokens. push ( Token {
0 commit comments