glsl_lang_pp/lexer/
pre.rs

1//! Second stage lexer declaration
2
3use lang_util::TextRange;
4
5use crate::util::{LineMap, Unescaped};
6
7use super::{NewlineSplitter, NewlineToken, NewlineTokenKind};
8
9mod token;
10pub use token::Token;
11use Token::*;
12
13pub type TextToken = crate::util::TextToken<token::Token>;
14
/// States of the pre-lexer state machine.
///
/// Every field is a plain `bool`, so equality is total and `Eq` is derived alongside
/// `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Initial state of the preprocessor
    Init,
    /// Possibly a start of comment
    Slash,
    /// Line continuation character seen
    Backslash,
    /// Building an identifier
    Ident,
    /// Building a digit sequence
    Digits {
        /// Exponent-marker flag, consulted when deciding whether a `+` or `-` may continue
        /// the sequence
        seen_e: bool,
        /// Set once a `.` has been consumed; a second `.` does not continue the sequence
        seen_dot: bool,
    },
    /// Single-line comment
    SingleComment,
    /// Multi-line comment
    MultiComment,
    /// Multi-line comment, saw a *
    MultiCommentStar,
    /// Any kind of horizontal whitespace
    Whitespace,
    /// Inside a quote string
    QuoteString,
    /// Inside an angle string
    AngleString,
}
40
41impl Default for State {
42    fn default() -> Self {
43        Self::Init
44    }
45}
46
/// A lexer for early lexical analysis stages.
///
/// This lexer does the following:
/// * Assemble digit sequences into single tokens
/// * Assemble identifier characters into single tokens
/// * Eliminate backslash-escaped newlines
/// * Identify single and multi-line comments
/// * Tokenize double-quoted strings and (when asked to) angle-quoted strings
#[derive(Debug, Clone)]
pub struct PreLexer<'i> {
    // The full input text; token ranges are resolved against it
    source: &'i str,
    // First-stage token stream this lexer consumes
    input: NewlineSplitter<'i>,
    // One-token lookahead buffer: `None` means nothing has been peeked yet, while
    // `Some(None)` means the end of input has been peeked
    peeked: Option<Option<NewlineToken>>,
    // Current state of the state machine
    state: State,
    // Range of the token currently being assembled; grows as input tokens are consumed
    start: TextRange,
    // State to resume once a backslash has been handled
    return_to: State,
    // When set, the next `<` punctuation opens an angle-quoted string
    expect_angle_string: bool,
}
65
66impl<'i> PreLexer<'i> {
67    pub fn new(input: &'i str) -> Self {
68        Self {
69            source: input,
70            input: NewlineSplitter::new(input),
71            state: Default::default(),
72            start: Default::default(),
73            return_to: Default::default(),
74            peeked: None,
75            expect_angle_string: false,
76        }
77    }
78
79    pub fn input(&self) -> &'i str {
80        self.source
81    }
82
83    pub fn line_map(&self) -> &LineMap {
84        self.input.line_map()
85    }
86
87    pub fn into_line_map(self) -> LineMap {
88        self.input.into_line_map()
89    }
90
91    pub fn set_expect_angle_string(&mut self, expect_angle_string: bool) {
92        self.expect_angle_string = expect_angle_string;
93    }
94
95    fn peek_token(&mut self) -> Option<(NewlineToken, &'i str)> {
96        self.peeked
97            .unwrap_or_else(|| {
98                let next = self.input.next();
99                self.peeked = Some(next);
100                next
101            })
102            .map(move |token| (token, token.raw(self.source)))
103    }
104
105    fn next_token(&mut self) -> Option<NewlineToken> {
106        let result = if let Some(token) = self.peeked.take() {
107            token
108        } else {
109            self.input.next()
110        };
111
112        // Expand the current token
113        if let Some(token) = &result {
114            self.start = TextRange::new(self.start.start(), token.range.end());
115        }
116
117        result
118    }
119}
120
121impl<'i> Iterator for PreLexer<'i> {
122    type Item = TextToken;
123
124    fn next(&mut self) -> Option<Self::Item> {
125        loop {
126            self.state = match std::mem::take(&mut self.state) {
127                State::Init => {
128                    let c = self.next_token();
129
130                    // All the following states need a recorded start point
131                    if let Some(t) = &c {
132                        self.start = t.range;
133                    }
134
135                    match c {
136                        Some(NewlineToken {
137                            token: NewlineTokenKind::LETTER,
138                            ..
139                        }) => {
140                            // Start an identifier
141                            State::Ident
142                        }
143                        Some(NewlineToken {
144                            token: NewlineTokenKind::DIGIT,
145                            ..
146                        }) => {
147                            // Start a digit sequence
148                            State::Digits {
149                                seen_e: false,
150                                seen_dot: false,
151                            }
152                        }
153                        Some(NewlineToken {
154                            token: NewlineTokenKind::PUNCT,
155                            range,
156                        }) => {
157                            let t = c.unwrap();
158                            let text = t.raw(self.source);
159
160                            match text {
161                                "\"" => {
162                                    // Also clear the string flag, since it should've started with
163                                    // < instead
164                                    self.expect_angle_string = false;
165                                    State::QuoteString
166                                }
167                                "<" if self.expect_angle_string => {
168                                    // Clear the string flag
169                                    self.expect_angle_string = false;
170                                    State::AngleString
171                                }
172                                "\\" => {
173                                    self.return_to = State::Init;
174                                    State::Backslash
175                                }
176                                "/" => State::Slash,
177                                "_" => State::Ident,
178                                _ => {
179                                    // Punctuation
180                                    return Some(TextToken {
181                                        token: Token::from_punct(text),
182                                        range,
183                                    });
184                                }
185                            }
186                        }
187                        Some(NewlineToken {
188                            token: NewlineTokenKind::NEWLINE,
189                            range,
190                        }) => {
191                            // A newline, this completes a potential #include
192                            self.expect_angle_string = false;
193
194                            // A newline
195                            return Some(TextToken {
196                                token: NEWLINE,
197                                range,
198                            });
199                        }
200                        Some(NewlineToken {
201                            token: NewlineTokenKind::WS,
202                            ..
203                        }) => State::Whitespace,
204                        None => {
205                            return None;
206                        }
207                    }
208                }
209
210                State::Slash => {
211                    match self.peek_token() {
212                        Some((_, "/")) => {
213                            self.next_token();
214                            State::SingleComment
215                        }
216                        Some((_, "*")) => {
217                            self.next_token();
218                            State::MultiComment
219                        }
220                        _ => {
221                            // Another char or EOI, so we saw a '/' followed by something else
222                            // Emit the '/' and then we'll reparse the char next round
223                            return Some(TextToken::new(SLASH, self.start));
224                        }
225                    }
226                }
227
228                State::Backslash => {
229                    // Either there's a newline and we should skip it, or there's something else
230                    // and we pass the backslash forward
231                    match self.peek_token() {
232                        Some((
233                            NewlineToken {
234                                token: NewlineTokenKind::NEWLINE,
235                                ..
236                            },
237                            _,
238                        )) => {
239                            self.next_token();
240                            if self.return_to == State::Init {
241                                // This line continuation is included nowhere, so we should emit it
242                                return Some(TextToken::new(LINECONT, self.start));
243                            } else {
244                                // This line continuation is part of some other token
245                                self.return_to
246                            }
247                        }
248                        _ => {
249                            if self.return_to == State::SingleComment
250                                || self.return_to == State::MultiComment
251                            {
252                                // In a comment, consume the backslash, don't emit it
253                                self.next_token();
254                                self.return_to
255                            } else {
256                                return Some(TextToken::new(BACKSLASH, self.start));
257                            }
258                        }
259                    }
260                }
261
262                State::Ident => {
263                    match self.peek_token() {
264                        Some((
265                            NewlineToken {
266                                token: NewlineTokenKind::LETTER,
267                                ..
268                            },
269                            _,
270                        ))
271                        | Some((
272                            NewlineToken {
273                                token: NewlineTokenKind::DIGIT,
274                                ..
275                            },
276                            _,
277                        ))
278                        | Some((
279                            NewlineToken {
280                                token: NewlineTokenKind::PUNCT,
281                                ..
282                            },
283                            "_",
284                        )) => {
285                            // Continue the ident
286                            self.next_token();
287                            State::Ident
288                        }
289                        Some((_, "\\")) => {
290                            self.next_token();
291                            self.return_to = State::Ident;
292                            State::Backslash
293                        }
294                        _ => {
295                            // Not an ident anymore, return the ident
296                            let token = TextToken::new(Token::IDENT_KW, self.start);
297
298                            // Check if IDENT_KW is the defined keyword
299                            if Unescaped::new(token.raw(self.source)) == "defined" {
300                                return Some(TextToken::new(Token::DEFINED, self.start));
301                            }
302
303                            return Some(token);
304                        }
305                    }
306                }
307
308                State::Digits { seen_e, seen_dot } => {
309                    match self.peek_token() {
310                        Some((
311                            NewlineToken {
312                                token: NewlineTokenKind::DIGIT,
313                                ..
314                            },
315                            _,
316                        )) => {
317                            self.next_token();
318                            State::Digits { seen_e, seen_dot }
319                        }
320                        Some((
321                            NewlineToken {
322                                token: NewlineTokenKind::PUNCT,
323                                ..
324                            },
325                            punct,
326                        )) if (!seen_dot && punct == ".")
327                            || seen_e && (punct == "+" || punct == "-") =>
328                        {
329                            self.next_token();
330                            State::Digits {
331                                seen_e,
332                                seen_dot: punct == "." || seen_dot,
333                            }
334                        }
335                        Some((
336                            NewlineToken {
337                                token: NewlineTokenKind::LETTER,
338                                ..
339                            },
340                            ch,
341                        )) => {
342                            self.next_token();
343                            State::Digits {
344                                seen_e: ch == "e" || ch == "E",
345                                seen_dot,
346                            }
347                        }
348                        Some((_, "\\")) => {
349                            self.next_token();
350                            self.return_to = State::Digits { seen_e, seen_dot };
351                            State::Backslash
352                        }
353                        _ => {
354                            // Not a digit sequence anymore, return the digit sequence
355                            return Some(TextToken::new(Token::DIGITS, self.start));
356                        }
357                    }
358                }
359
360                State::SingleComment => {
361                    match self.peek_token() {
362                        Some((
363                            NewlineToken {
364                                token: NewlineTokenKind::NEWLINE,
365                                ..
366                            },
367                            _,
368                        ))
369                        | None => {
370                            // Do not eat the newline yet
371                            return Some(TextToken::new(Token::COMMENT, self.start));
372                        }
373                        Some((_, "\\")) => {
374                            self.next_token();
375                            self.return_to = State::SingleComment;
376                            State::Backslash
377                        }
378                        _ => {
379                            // Any other char
380                            self.next_token();
381                            State::SingleComment
382                        }
383                    }
384                }
385
386                State::MultiComment => {
387                    match self.peek_token() {
388                        Some((_, "*")) => {
389                            self.next_token();
390                            State::MultiCommentStar
391                        }
392                        None => {
393                            // Unfinished comment
394                            return Some(TextToken::new(Token::ERROR, self.start));
395                        }
396                        _ => {
397                            // Any other char
398                            self.next_token();
399                            State::MultiComment
400                        }
401                    }
402                }
403
404                State::MultiCommentStar => {
405                    match self.peek_token() {
406                        Some((_, "/")) => {
407                            self.next_token();
408                            return Some(TextToken::new(Token::COMMENT, self.start));
409                        }
410                        None => {
411                            // Unfinished comment
412                            return Some(TextToken::new(Token::ERROR, self.start));
413                        }
414                        _ => {
415                            // Any other char
416                            self.next_token();
417                            State::MultiComment
418                        }
419                    }
420                }
421
422                State::Whitespace => {
423                    if self
424                        .peek_token()
425                        .map(|(token, _)| token.token == NewlineTokenKind::WS)
426                        .unwrap_or(false)
427                    {
428                        // More whitespace
429                        self.next_token();
430                        State::Whitespace
431                    } else {
432                        // No more whitespace
433                        return Some(TextToken::new(WS, self.start));
434                    }
435                }
436
437                State::QuoteString => {
438                    if let Some((token, text)) = self.peek_token() {
439                        // Release text borrow
440                        let end_quote = text == "\"";
441
442                        // Always consume the token
443                        self.next_token();
444
445                        // Extend the range
446                        self.start = TextRange::new(self.start.start(), token.range.end());
447
448                        if end_quote {
449                            return Some(TextToken::new(QUOTE_STRING, self.start));
450                        } else {
451                            State::QuoteString
452                        }
453                    } else {
454                        // No more tokens, bump an error
455                        return Some(TextToken::new(ERROR, self.start));
456                    }
457                }
458
459                State::AngleString => {
460                    if let Some((token, text)) = self.peek_token() {
461                        // Release text borrow
462                        let end_quote = text == ">";
463
464                        // Always consume the token
465                        self.next_token();
466
467                        // Extend the range
468                        self.start = TextRange::new(self.start.start(), token.range.end());
469
470                        if end_quote {
471                            return Some(TextToken::new(ANGLE_STRING, self.start));
472                        } else {
473                            State::AngleString
474                        }
475                    } else {
476                        // No more tokens, bump an error
477                        return Some(TextToken::new(ERROR, self.start));
478                    }
479                }
480            };
481        }
482    }
483}