glsl_lang_pp/lexer/pre.rs
1//! Second stage lexer declaration
2
3use lang_util::TextRange;
4
5use crate::util::{LineMap, Unescaped};
6
7use super::{NewlineSplitter, NewlineToken, NewlineTokenKind};
8
9mod token;
10pub use token::Token;
11use Token::*;
12
13pub type TextToken = crate::util::TextToken<token::Token>;
14
/// States of the pre-lexer state machine.
///
/// Every variant only carries `bool` flags, so equality is total and the type derives
/// `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Initial state of the preprocessor: no token is being assembled
    Init,
    /// A `/` was read: possibly the start of a comment
    Slash,
    /// Line continuation character (`\`) seen
    Backslash,
    /// Building an identifier
    Ident,
    /// Building a digit sequence
    ///
    /// `seen_e` records an `e`/`E` exponent marker, after which a `+`/`-` sign may join
    /// the sequence. `seen_dot` records that a `.` was already consumed, so another one
    /// ends the sequence.
    Digits { seen_e: bool, seen_dot: bool },
    /// Single-line comment
    SingleComment,
    /// Multi-line comment
    MultiComment,
    /// Multi-line comment, saw a `*`
    MultiCommentStar,
    /// Any kind of horizontal whitespace
    Whitespace,
    /// Inside a quote string
    QuoteString,
    /// Inside an angle string
    AngleString,
}
40
41impl Default for State {
42 fn default() -> Self {
43 Self::Init
44 }
45}
46
47/// A lexer for early lexical analysis stages.
48///
49/// This lexer does the following:
50/// * Assemble digit sequences into single tokens
51/// * Assemble identifier characters into single tokens
52/// * Eliminate backslash-escaped newlines
53/// * Identify single and multi-line comments
54/// * Tokenize double-quoted strings and (when asked to) angle-quoted strings
55#[derive(Debug, Clone)]
56pub struct PreLexer<'i> {
57 source: &'i str,
58 input: NewlineSplitter<'i>,
59 peeked: Option<Option<NewlineToken>>,
60 state: State,
61 start: TextRange,
62 return_to: State,
63 expect_angle_string: bool,
64}
65
66impl<'i> PreLexer<'i> {
67 pub fn new(input: &'i str) -> Self {
68 Self {
69 source: input,
70 input: NewlineSplitter::new(input),
71 state: Default::default(),
72 start: Default::default(),
73 return_to: Default::default(),
74 peeked: None,
75 expect_angle_string: false,
76 }
77 }
78
79 pub fn input(&self) -> &'i str {
80 self.source
81 }
82
83 pub fn line_map(&self) -> &LineMap {
84 self.input.line_map()
85 }
86
87 pub fn into_line_map(self) -> LineMap {
88 self.input.into_line_map()
89 }
90
91 pub fn set_expect_angle_string(&mut self, expect_angle_string: bool) {
92 self.expect_angle_string = expect_angle_string;
93 }
94
95 fn peek_token(&mut self) -> Option<(NewlineToken, &'i str)> {
96 self.peeked
97 .unwrap_or_else(|| {
98 let next = self.input.next();
99 self.peeked = Some(next);
100 next
101 })
102 .map(move |token| (token, token.raw(self.source)))
103 }
104
105 fn next_token(&mut self) -> Option<NewlineToken> {
106 let result = if let Some(token) = self.peeked.take() {
107 token
108 } else {
109 self.input.next()
110 };
111
112 // Expand the current token
113 if let Some(token) = &result {
114 self.start = TextRange::new(self.start.start(), token.range.end());
115 }
116
117 result
118 }
119}
120
121impl<'i> Iterator for PreLexer<'i> {
122 type Item = TextToken;
123
124 fn next(&mut self) -> Option<Self::Item> {
125 loop {
126 self.state = match std::mem::take(&mut self.state) {
127 State::Init => {
128 let c = self.next_token();
129
130 // All the following states need a recorded start point
131 if let Some(t) = &c {
132 self.start = t.range;
133 }
134
135 match c {
136 Some(NewlineToken {
137 token: NewlineTokenKind::LETTER,
138 ..
139 }) => {
140 // Start an identifier
141 State::Ident
142 }
143 Some(NewlineToken {
144 token: NewlineTokenKind::DIGIT,
145 ..
146 }) => {
147 // Start a digit sequence
148 State::Digits {
149 seen_e: false,
150 seen_dot: false,
151 }
152 }
153 Some(NewlineToken {
154 token: NewlineTokenKind::PUNCT,
155 range,
156 }) => {
157 let t = c.unwrap();
158 let text = t.raw(self.source);
159
160 match text {
161 "\"" => {
162 // Also clear the string flag, since it should've started with
163 // < instead
164 self.expect_angle_string = false;
165 State::QuoteString
166 }
167 "<" if self.expect_angle_string => {
168 // Clear the string flag
169 self.expect_angle_string = false;
170 State::AngleString
171 }
172 "\\" => {
173 self.return_to = State::Init;
174 State::Backslash
175 }
176 "/" => State::Slash,
177 "_" => State::Ident,
178 _ => {
179 // Punctuation
180 return Some(TextToken {
181 token: Token::from_punct(text),
182 range,
183 });
184 }
185 }
186 }
187 Some(NewlineToken {
188 token: NewlineTokenKind::NEWLINE,
189 range,
190 }) => {
191 // A newline, this completes a potential #include
192 self.expect_angle_string = false;
193
194 // A newline
195 return Some(TextToken {
196 token: NEWLINE,
197 range,
198 });
199 }
200 Some(NewlineToken {
201 token: NewlineTokenKind::WS,
202 ..
203 }) => State::Whitespace,
204 None => {
205 return None;
206 }
207 }
208 }
209
210 State::Slash => {
211 match self.peek_token() {
212 Some((_, "/")) => {
213 self.next_token();
214 State::SingleComment
215 }
216 Some((_, "*")) => {
217 self.next_token();
218 State::MultiComment
219 }
220 _ => {
221 // Another char or EOI, so we saw a '/' followed by something else
222 // Emit the '/' and then we'll reparse the char next round
223 return Some(TextToken::new(SLASH, self.start));
224 }
225 }
226 }
227
228 State::Backslash => {
229 // Either there's a newline and we should skip it, or there's something else
230 // and we pass the backslash forward
231 match self.peek_token() {
232 Some((
233 NewlineToken {
234 token: NewlineTokenKind::NEWLINE,
235 ..
236 },
237 _,
238 )) => {
239 self.next_token();
240 if self.return_to == State::Init {
241 // This line continuation is included nowhere, so we should emit it
242 return Some(TextToken::new(LINECONT, self.start));
243 } else {
244 // This line continuation is part of some other token
245 self.return_to
246 }
247 }
248 _ => {
249 if self.return_to == State::SingleComment
250 || self.return_to == State::MultiComment
251 {
252 // In a comment, consume the backslash, don't emit it
253 self.next_token();
254 self.return_to
255 } else {
256 return Some(TextToken::new(BACKSLASH, self.start));
257 }
258 }
259 }
260 }
261
262 State::Ident => {
263 match self.peek_token() {
264 Some((
265 NewlineToken {
266 token: NewlineTokenKind::LETTER,
267 ..
268 },
269 _,
270 ))
271 | Some((
272 NewlineToken {
273 token: NewlineTokenKind::DIGIT,
274 ..
275 },
276 _,
277 ))
278 | Some((
279 NewlineToken {
280 token: NewlineTokenKind::PUNCT,
281 ..
282 },
283 "_",
284 )) => {
285 // Continue the ident
286 self.next_token();
287 State::Ident
288 }
289 Some((_, "\\")) => {
290 self.next_token();
291 self.return_to = State::Ident;
292 State::Backslash
293 }
294 _ => {
295 // Not an ident anymore, return the ident
296 let token = TextToken::new(Token::IDENT_KW, self.start);
297
298 // Check if IDENT_KW is the defined keyword
299 if Unescaped::new(token.raw(self.source)) == "defined" {
300 return Some(TextToken::new(Token::DEFINED, self.start));
301 }
302
303 return Some(token);
304 }
305 }
306 }
307
308 State::Digits { seen_e, seen_dot } => {
309 match self.peek_token() {
310 Some((
311 NewlineToken {
312 token: NewlineTokenKind::DIGIT,
313 ..
314 },
315 _,
316 )) => {
317 self.next_token();
318 State::Digits { seen_e, seen_dot }
319 }
320 Some((
321 NewlineToken {
322 token: NewlineTokenKind::PUNCT,
323 ..
324 },
325 punct,
326 )) if (!seen_dot && punct == ".")
327 || seen_e && (punct == "+" || punct == "-") =>
328 {
329 self.next_token();
330 State::Digits {
331 seen_e,
332 seen_dot: punct == "." || seen_dot,
333 }
334 }
335 Some((
336 NewlineToken {
337 token: NewlineTokenKind::LETTER,
338 ..
339 },
340 ch,
341 )) => {
342 self.next_token();
343 State::Digits {
344 seen_e: ch == "e" || ch == "E",
345 seen_dot,
346 }
347 }
348 Some((_, "\\")) => {
349 self.next_token();
350 self.return_to = State::Digits { seen_e, seen_dot };
351 State::Backslash
352 }
353 _ => {
354 // Not a digit sequence anymore, return the digit sequence
355 return Some(TextToken::new(Token::DIGITS, self.start));
356 }
357 }
358 }
359
360 State::SingleComment => {
361 match self.peek_token() {
362 Some((
363 NewlineToken {
364 token: NewlineTokenKind::NEWLINE,
365 ..
366 },
367 _,
368 ))
369 | None => {
370 // Do not eat the newline yet
371 return Some(TextToken::new(Token::COMMENT, self.start));
372 }
373 Some((_, "\\")) => {
374 self.next_token();
375 self.return_to = State::SingleComment;
376 State::Backslash
377 }
378 _ => {
379 // Any other char
380 self.next_token();
381 State::SingleComment
382 }
383 }
384 }
385
386 State::MultiComment => {
387 match self.peek_token() {
388 Some((_, "*")) => {
389 self.next_token();
390 State::MultiCommentStar
391 }
392 None => {
393 // Unfinished comment
394 return Some(TextToken::new(Token::ERROR, self.start));
395 }
396 _ => {
397 // Any other char
398 self.next_token();
399 State::MultiComment
400 }
401 }
402 }
403
404 State::MultiCommentStar => {
405 match self.peek_token() {
406 Some((_, "/")) => {
407 self.next_token();
408 return Some(TextToken::new(Token::COMMENT, self.start));
409 }
410 None => {
411 // Unfinished comment
412 return Some(TextToken::new(Token::ERROR, self.start));
413 }
414 _ => {
415 // Any other char
416 self.next_token();
417 State::MultiComment
418 }
419 }
420 }
421
422 State::Whitespace => {
423 if self
424 .peek_token()
425 .map(|(token, _)| token.token == NewlineTokenKind::WS)
426 .unwrap_or(false)
427 {
428 // More whitespace
429 self.next_token();
430 State::Whitespace
431 } else {
432 // No more whitespace
433 return Some(TextToken::new(WS, self.start));
434 }
435 }
436
437 State::QuoteString => {
438 if let Some((token, text)) = self.peek_token() {
439 // Release text borrow
440 let end_quote = text == "\"";
441
442 // Always consume the token
443 self.next_token();
444
445 // Extend the range
446 self.start = TextRange::new(self.start.start(), token.range.end());
447
448 if end_quote {
449 return Some(TextToken::new(QUOTE_STRING, self.start));
450 } else {
451 State::QuoteString
452 }
453 } else {
454 // No more tokens, bump an error
455 return Some(TextToken::new(ERROR, self.start));
456 }
457 }
458
459 State::AngleString => {
460 if let Some((token, text)) = self.peek_token() {
461 // Release text borrow
462 let end_quote = text == ">";
463
464 // Always consume the token
465 self.next_token();
466
467 // Extend the range
468 self.start = TextRange::new(self.start.start(), token.range.end());
469
470 if end_quote {
471 return Some(TextToken::new(ANGLE_STRING, self.start));
472 } else {
473 State::AngleString
474 }
475 } else {
476 // No more tokens, bump an error
477 return Some(TextToken::new(ERROR, self.start));
478 }
479 }
480 };
481 }
482 }
483}