Merge pull request #42 from robotpy/lexer-updates

Significant lexer overhaul
Dustin Spicuzza 2022-12-15 21:29:32 -05:00 committed by GitHub
commit 296272fd39
10 changed files with 755 additions and 210 deletions


@ -1,6 +1,6 @@
cxxheaderparser license: cxxheaderparser license:
Copyright (c) 2020 Dustin Spicuzza <dustin@virtualroadside.com> Copyright (c) 2020-2022 Dustin Spicuzza <dustin@virtualroadside.com>
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -102,3 +102,31 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
pycparser -- A C parser in Python
Copyright (c) 2008-2022, Eli Bendersky
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@ -0,0 +1,5 @@
# PLY package
# Author: David Beazley (dave@dabeaz.com)
# https://github.com/dabeaz/ply
__version__ = "2022.10.27"


@ -2,7 +2,7 @@
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# ply: lex.py # ply: lex.py
# #
# Copyright (C) 2001-2020 # Copyright (C) 2001-2022
# David M. Beazley (Dabeaz LLC) # David M. Beazley (Dabeaz LLC)
# All rights reserved. # All rights reserved.
# #


@ -1,6 +1,7 @@
import typing import typing
from .lexer import LexToken if typing.TYPE_CHECKING:
from .lexer import LexToken
class CxxParseError(Exception): class CxxParseError(Exception):


@ -5,6 +5,13 @@ import typing
import sys import sys
from ._ply import lex from ._ply import lex
from ._ply.lex import TOKEN
from .errors import CxxParseError
class LexError(CxxParseError):
pass
if sys.version_info >= (3, 8): if sys.version_info >= (3, 8):
@ -43,7 +50,7 @@ class LexToken(Protocol):
location: Location location: Location
#: private #: private
lexer: "Lexer" lexer: lex.Lexer
PhonyEnding: LexToken = lex.LexToken() # type: ignore PhonyEnding: LexToken = lex.LexToken() # type: ignore
@ -53,7 +60,14 @@ PhonyEnding.lineno = 0
PhonyEnding.lexpos = 0 PhonyEnding.lexpos = 0
class Lexer: class PlyLexer:
"""
This lexer combines pieces of the PLY lexers that CppHeaderParser
and pycparser use.
It only splits the input into tokens; the other lexer classes in this
module do the more complex processing of the resulting token stream.
"""
keywords = { keywords = {
"__attribute__", "__attribute__",
@ -137,16 +151,35 @@ class Lexer:
} }
tokens = [ tokens = [
"NUMBER", # constants
"FLOAT_NUMBER", "FLOAT_CONST",
"HEX_FLOAT_CONST",
"INT_CONST_HEX",
"INT_CONST_BIN",
"INT_CONST_OCT",
"INT_CONST_DEC",
"INT_CONST_CHAR",
"CHAR_CONST",
"WCHAR_CONST",
"U8CHAR_CONST",
"U16CHAR_CONST",
"U32CHAR_CONST",
# String literals
"STRING_LITERAL",
"WSTRING_LITERAL",
"U8STRING_LITERAL",
"U16STRING_LITERAL",
"U32STRING_LITERAL",
#
"NAME", "NAME",
# Comments
"COMMENT_SINGLELINE", "COMMENT_SINGLELINE",
"COMMENT_MULTILINE", "COMMENT_MULTILINE",
"PRECOMP_MACRO", "PRECOMP_MACRO",
# misc
"DIVIDE", "DIVIDE",
"CHAR_LITERAL",
"STRING_LITERAL",
"NEWLINE", "NEWLINE",
"WHITESPACE",
"ELLIPSIS", "ELLIPSIS",
"DBL_LBRACKET", "DBL_LBRACKET",
"DBL_RBRACKET", "DBL_RBRACKET",
@ -182,40 +215,236 @@ class Lexer:
".", ".",
] ]
t_ignore = " \t\r?@\f" #
t_NUMBER = r"[0-9][0-9XxA-Fa-f]*" # Regexes for use in tokens (taken from pycparser)
t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?" #
hex_prefix = "0[xX]"
hex_digits = "[0-9a-fA-F]+"
bin_prefix = "0[bB]"
bin_digits = "[01]+"
# integer constants (K&R2: A.2.5.1)
integer_suffix_opt = (
r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
decimal_constant = (
"(0" + integer_suffix_opt + ")|([1-9][0-9]*" + integer_suffix_opt + ")"
)
octal_constant = "0[0-7]*" + integer_suffix_opt
hex_constant = hex_prefix + hex_digits + integer_suffix_opt
bin_constant = bin_prefix + bin_digits + integer_suffix_opt
bad_octal_constant = "0[0-7]*[89]"
# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if that sometimes means accepting incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
# simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
# decimal_escape = r"""(\d+)"""
# hex_escape = r"""(x[0-9a-fA-F]+)"""
# bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed backtracking:
# (https://github.com/eliben/pycparser/issues/61)
#
# - simple_escape now only matches \x when it is not followed by a hex digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next character (if any) is not a hex digit
# - decimal_escape allows one or more decimal characters, but requires that the next character (if any) is not a decimal digit
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
# e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
decimal_escape = r"""(\d+)(?!\d)"""
hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
escape_sequence = (
r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"
)
# The lookahead in the regex above could be slow when scanning strings. Because all of the valid escapes (including \x) allow
# zero or more non-escaped characters after the first character, simple_escape + decimal_escape + hex_escape simplifies for strings to
escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
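(Illustrative check, not part of this diff: the reworked escape fragments above give each escape sequence exactly one way to match, so Python's re module never has to backtrack between the alternatives.)
import re

# same fragments as above, restated so this snippet stands alone
simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
decimal_escape = r"""(\d+)(?!\d)"""
hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
escape_sequence = r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"

assert re.fullmatch(escape_sequence, r"\123")   # one unambiguous decimal escape
assert re.fullmatch(escape_sequence, r"\x1F")   # hex escape
assert re.fullmatch(escape_sequence, r"\n")     # simple escape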
cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")"
char_const = "'" + cconst_char + "'"
wchar_const = "L" + char_const
u8char_const = "u8" + char_const
u16char_const = "u" + char_const
u32char_const = "U" + char_const
multicharacter_constant = "'" + cconst_char + "{2,4}'"
unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)"
bad_char_const = (
r"""('"""
+ cconst_char
+ """[^'\n]+')|('')|('"""
+ bad_escape
+ r"""[^'\n]*')"""
)
# string literals (K&R2: A.2.6)
string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")"
string_literal = '"' + string_char + '*"'
wstring_literal = "L" + string_literal
u8string_literal = "u8" + string_literal
u16string_literal = "u" + string_literal
u32string_literal = "U" + string_literal
bad_string_literal = '"' + string_char + "*" + bad_escape + string_char + '*"'
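(Illustrative sketch, not part of this diff: unlike the old STRING_LITERAL regex removed below — which carried a TODO about escaped quotes — the pycparser-derived string_literal above copes with \" inside a string.)
from cxxheaderparser.lexer import PlyLexer

lexer = PlyLexer(None)
lexer.input(r'"bla \" bla"')
tok = lexer.token()
assert tok.type == "STRING_LITERAL"
assert tok.value == r'"bla \" bla"'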
# floating constants (K&R2: A.2.5.3)
exponent_part = r"""([eE][-+]?[0-9]+)"""
fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
floating_constant = (
"(((("
+ fractional_constant
+ ")"
+ exponent_part
+ "?)|([0-9]+"
+ exponent_part
+ "))[FfLl]?)"
)
binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
hex_fractional_constant = (
"(((" + hex_digits + r""")?\.""" + hex_digits + ")|(" + hex_digits + r"""\.))"""
)
hex_floating_constant = (
"("
+ hex_prefix
+ "("
+ hex_digits
+ "|"
+ hex_fractional_constant
+ ")"
+ binary_exponent_part
+ "[FfLl]?)"
)
t_WHITESPACE = "[ \t]+"
t_ignore = "\r"
# The following floating and integer constants are defined as
# functions to impose a strict match order (otherwise PLY sorts
# string-defined rules by regex length, which would try the decimal
# rule before the hex/octal/binary rules and mis-lex those constants)
#
@TOKEN(floating_constant)
def t_FLOAT_CONST(self, t: LexToken) -> LexToken:
return t
@TOKEN(hex_floating_constant)
def t_HEX_FLOAT_CONST(self, t: LexToken) -> LexToken:
return t
@TOKEN(hex_constant)
def t_INT_CONST_HEX(self, t: LexToken) -> LexToken:
return t
@TOKEN(bin_constant)
def t_INT_CONST_BIN(self, t: LexToken) -> LexToken:
return t
@TOKEN(bad_octal_constant)
def t_BAD_CONST_OCT(self, t: LexToken) -> None:
msg = "Invalid octal constant"
self._error(msg, t)
@TOKEN(octal_constant)
def t_INT_CONST_OCT(self, t: LexToken) -> LexToken:
return t
@TOKEN(decimal_constant)
def t_INT_CONST_DEC(self, t: LexToken) -> LexToken:
return t
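(Illustrative check, not part of this diff: with the strict ordering described above, each kind of numeric literal lands on its intended token type instead of being swallowed by the decimal rule. Assumes PlyLexer as defined in this file.)
from cxxheaderparser.lexer import PlyLexer

def first_token_type(src: str) -> str:
    # tiny helper for this sketch only
    lexer = PlyLexer(None)
    lexer.input(src)
    return lexer.token().type

assert first_token_type("1.5f") == "FLOAT_CONST"
assert first_token_type("0x1.8p3") == "HEX_FLOAT_CONST"
assert first_token_type("0x1F") == "INT_CONST_HEX"
assert first_token_type("0b1010") == "INT_CONST_BIN"
assert first_token_type("017") == "INT_CONST_OCT"
assert first_token_type("42u") == "INT_CONST_DEC"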
# Must come before bad_char_const, to prevent it from
# catching valid char constants as invalid
#
@TOKEN(multicharacter_constant)
def t_INT_CONST_CHAR(self, t: LexToken) -> LexToken:
return t
@TOKEN(char_const)
def t_CHAR_CONST(self, t: LexToken) -> LexToken:
return t
@TOKEN(wchar_const)
def t_WCHAR_CONST(self, t: LexToken) -> LexToken:
return t
@TOKEN(u8char_const)
def t_U8CHAR_CONST(self, t: LexToken) -> LexToken:
return t
@TOKEN(u16char_const)
def t_U16CHAR_CONST(self, t: LexToken) -> LexToken:
return t
@TOKEN(u32char_const)
def t_U32CHAR_CONST(self, t: LexToken) -> LexToken:
return t
@TOKEN(unmatched_quote)
def t_UNMATCHED_QUOTE(self, t: LexToken) -> None:
msg = "Unmatched '"
self._error(msg, t)
@TOKEN(bad_char_const)
def t_BAD_CHAR_CONST(self, t: LexToken) -> None:
msg = "Invalid char constant %s" % t.value
self._error(msg, t)
@TOKEN(wstring_literal)
def t_WSTRING_LITERAL(self, t: LexToken) -> LexToken:
return t
@TOKEN(u8string_literal)
def t_U8STRING_LITERAL(self, t: LexToken) -> LexToken:
return t
@TOKEN(u16string_literal)
def t_U16STRING_LITERAL(self, t: LexToken) -> LexToken:
return t
@TOKEN(u32string_literal)
def t_U32STRING_LITERAL(self, t: LexToken) -> LexToken:
return t
# unmatched string literals are caught by the preprocessor
@TOKEN(bad_string_literal)
def t_BAD_STRING_LITERAL(self, t: LexToken) -> None:
msg = "String contains invalid escape code"
self._error(msg, t)
@TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*")
def t_NAME(self, t: LexToken) -> LexToken: def t_NAME(self, t: LexToken) -> LexToken:
r"[A-Za-z_~][A-Za-z0-9_]*"
if t.value in self.keywords: if t.value in self.keywords:
t.type = t.value t.type = t.value
return t return t
@TOKEN(r"\#.*")
def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]: def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]:
r"\#.*"
m = _line_re.match(t.value) m = _line_re.match(t.value)
if m: if m:
filename = m.group(2) self.filename = m.group(2)
if filename not in self._filenames_set:
self.filenames.append(filename)
self._filenames_set.add(filename)
self.filename = filename
self.line_offset = 1 + self.lex.lineno - int(m.group(1)) self.line_offset = 1 + self.lex.lineno - int(m.group(1))
return None return None
else: else:
return t return t
def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
r"\/\/.*\n?"
if t.value.startswith("///") or t.value.startswith("//!"):
self.comments.append(t.value.lstrip("\t ").rstrip("\n"))
t.lexer.lineno += t.value.count("\n")
return t
t_DIVIDE = r"/(?!/)" t_DIVIDE = r"/(?!/)"
t_CHAR_LITERAL = "'.'"
t_ELLIPSIS = r"\.\.\." t_ELLIPSIS = r"\.\.\."
t_DBL_LBRACKET = r"\[\[" t_DBL_LBRACKET = r"\[\["
t_DBL_RBRACKET = r"\]\]" t_DBL_RBRACKET = r"\]\]"
@ -225,36 +454,35 @@ class Lexer:
t_SHIFT_LEFT = r"<<" t_SHIFT_LEFT = r"<<"
# SHIFT_RIGHT introduces ambiguity # SHIFT_RIGHT introduces ambiguity
# found at http://wordaligned.org/articles/string-literals-and-regular-expressions t_STRING_LITERAL = string_literal
# TODO: This does not work with the string "bla \" bla"
t_STRING_LITERAL = r'"([^"\\]|\\.)*"'
# Found at http://ostermiller.org/findcomment.html @TOKEN(r"\/\/.*\n?")
def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken: def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?"
if t.value.startswith("/**") or t.value.startswith("/*!"):
# not sure why, but get double new lines
v = t.value.replace("\n\n", "\n")
# strip prefixing whitespace
v = _multicomment_re.sub("\n*", v)
self.comments = v.splitlines()
t.lexer.lineno += t.value.count("\n") t.lexer.lineno += t.value.count("\n")
return t return t
# Found at http://ostermiller.org/findcomment.html
@TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?")
def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken:
t.lexer.lineno += t.value.count("\n")
return t
@TOKEN(r"\n+")
def t_NEWLINE(self, t: LexToken) -> LexToken: def t_NEWLINE(self, t: LexToken) -> LexToken:
r"\n+"
t.lexer.lineno += len(t.value) t.lexer.lineno += len(t.value)
del self.comments[:]
return t return t
def t_error(self, t: LexToken) -> None: def t_error(self, t: LexToken) -> None:
print("Lex error: ", t) self._error(f"Illegal character {t.value!r}", t)
def _error(self, msg: str, tok: LexToken):
tok.location = self.current_location()
raise LexError(msg, tok)
_lexer = None _lexer = None
lex: lex.Lexer lex: lex.Lexer
lineno: int
def __new__(cls, *args, **kwargs) -> "Lexer": def __new__(cls, *args, **kwargs) -> "PlyLexer":
# only build the lexer once # only build the lexer once
inst = super().__new__(cls) inst = super().__new__(cls)
if cls._lexer is None: if cls._lexer is None:
@ -266,164 +494,80 @@ class Lexer:
def __init__(self, filename: typing.Optional[str] = None): def __init__(self, filename: typing.Optional[str] = None):
self.input: typing.Callable[[str], None] = self.lex.input self.input: typing.Callable[[str], None] = self.lex.input
self.token: typing.Callable[[], LexToken] = self.lex.token
# For tracking current file/line position # For tracking current file/line position
self.filename = filename self.filename = filename
self.line_offset = 0 self.line_offset = 0
self.filenames: typing.List[str] = [] def current_location(self) -> Location:
self._filenames_set: typing.Set[str] = set() return Location(self.filename, self.lex.lineno - self.line_offset)
if filename:
self.filenames.append(filename)
self._filenames_set.add(filename)
# Doxygen comments class TokenStream:
self.comments = [] """
Provides access to a stream of tokens
"""
self.lookahead = typing.Deque[LexToken]() tokbuf: typing.Deque[LexToken]
# For 'set_group_of_tokens' support def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
self._get_token: typing.Callable[[], LexToken] = self.lex.token """
self.lookahead_stack = typing.Deque[typing.Deque[LexToken]]() Fills tokbuf with tokens from the next line. Return True if at least
one token was added to the buffer
"""
raise NotImplementedError
def current_location(self) -> Location: def current_location(self) -> Location:
if self.lookahead: raise NotImplementedError
return self.lookahead[0].location
return Location(self.filename, self.lex.lineno - self.line_offset)
def get_doxygen(self) -> typing.Optional[str]: def get_doxygen(self) -> typing.Optional[str]:
""" """
This should be called after the first element of something has This is called at the point that you want doxygen information
been consumed.
It will lookahead for comments that come after the item, if prior
comments don't exist.
""" """
raise NotImplementedError
# Assumption: This function is either called at the beginning of a def get_doxygen_after(self) -> typing.Optional[str]:
# statement or at the end of a statement """
This is called to retrieve doxygen information after a statement
"""
raise NotImplementedError
if self.comments: _discard_types = {
comments = self.comments "NEWLINE",
else: "COMMENT_SINGLELINE",
comments = [] "COMMENT_MULTILINE",
# only look for comments until a newline (including lookahead) "WHITESPACE",
for tok in self.lookahead: }
if tok.type == "NEWLINE":
return None
while True:
tok = self._get_token()
comments.extend(self.comments)
if tok is None:
break
tok.location = Location(self.filename, tok.lineno - self.line_offset)
ttype = tok.type
if ttype == "NEWLINE":
self.lookahead.append(tok)
break
if ttype not in self._discard_types:
self.lookahead.append(tok)
if ttype == "NAME":
break
del self.comments[:]
comment_str = "\n".join(comments)
del self.comments[:]
if comment_str:
return comment_str
return None
_discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"}
def _token_limit_exceeded(self) -> typing.NoReturn:
from .errors import CxxParseError
raise CxxParseError("no more tokens left in this group")
@contextlib.contextmanager
def set_group_of_tokens(
self, toks: typing.List[LexToken]
) -> typing.Generator[typing.Deque[LexToken], None, None]:
# intended for use when you have a set of tokens that you know
# must be consumed, such as a paren grouping or some type of
# lookahead case
stack = self.lookahead_stack
restore_fn = False
if not stack:
restore_fn = True
self._get_token = self._token_limit_exceeded
this_buf = typing.Deque[LexToken](toks)
prev_buf = self.lookahead
stack.append(prev_buf)
self.lookahead = this_buf
try:
yield this_buf
finally:
buf = stack.pop()
if prev_buf is not buf:
raise ValueError("internal error")
self.lookahead = prev_buf
if restore_fn:
self._get_token = self.lex.token
def token(self) -> LexToken: def token(self) -> LexToken:
tok = None tokbuf = self.tokbuf
while self.lookahead: while True:
tok = self.lookahead.popleft() while tokbuf:
tok = tokbuf.popleft()
if tok.type not in self._discard_types: if tok.type not in self._discard_types:
return tok return tok
while True: if not self._fill_tokbuf(tokbuf):
tok = self._get_token()
if tok is None:
raise EOFError("unexpected end of file") raise EOFError("unexpected end of file")
if tok.type not in self._discard_types:
tok.location = Location(self.filename, tok.lineno - self.line_offset)
break
return tok
def token_eof_ok(self) -> typing.Optional[LexToken]: def token_eof_ok(self) -> typing.Optional[LexToken]:
tok = None tokbuf = self.tokbuf
while self.lookahead:
tok = self.lookahead.popleft()
if tok.type not in self._discard_types:
return tok
while True: while True:
tok = self._get_token() while tokbuf:
if tok is None: tok = tokbuf.popleft()
break
if tok.type not in self._discard_types: if tok.type not in self._discard_types:
tok.location = Location(self.filename, tok.lineno - self.line_offset)
break
return tok return tok
if not self._fill_tokbuf(tokbuf):
return None
def token_if(self, *types: str) -> typing.Optional[LexToken]: def token_if(self, *types: str) -> typing.Optional[LexToken]:
tok = self.token_eof_ok() tok = self.token_eof_ok()
if tok is None: if tok is None:
return None return None
if tok.type not in types: if tok.type not in types:
# put it back on the left in case it was retrieved self.tokbuf.appendleft(tok)
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None return None
return tok return tok
@ -432,9 +576,7 @@ class Lexer:
if tok is None: if tok is None:
return None return None
if tok.type not in types: if tok.type not in types:
# put it back on the left in case it was retrieved self.tokbuf.appendleft(tok)
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None return None
return tok return tok
@ -443,9 +585,7 @@ class Lexer:
if tok is None: if tok is None:
return None return None
if tok.value not in vals: if tok.value not in vals:
# put it back on the left in case it was retrieved self.tokbuf.appendleft(tok)
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None return None
return tok return tok
@ -454,9 +594,7 @@ class Lexer:
if tok is None: if tok is None:
return None return None
if tok.type in types: if tok.type in types:
# put it back on the left in case it was retrieved self.tokbuf.appendleft(tok)
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None return None
return tok return tok
@ -464,18 +602,217 @@ class Lexer:
tok = self.token_eof_ok() tok = self.token_eof_ok()
if not tok: if not tok:
return False return False
self.lookahead.appendleft(tok) self.tokbuf.appendleft(tok)
return tok.type in types return tok.type in types
def return_token(self, tok: LexToken) -> None: def return_token(self, tok: LexToken) -> None:
self.lookahead.appendleft(tok) self.tokbuf.appendleft(tok)
def return_tokens(self, toks: typing.Sequence[LexToken]) -> None: def return_tokens(self, toks: typing.Sequence[LexToken]) -> None:
self.lookahead.extendleft(reversed(toks)) self.tokbuf.extendleft(reversed(toks))
class LexerTokenStream(TokenStream):
"""
Provides tokens by running PlyLexer over the given input text
"""
_user_defined_literal_start = {
"FLOAT_CONST",
"HEX_FLOAT_CONST",
"INT_CONST_HEX",
"INT_CONST_BIN",
"INT_CONST_OCT",
"INT_CONST_DEC",
"INT_CONST_CHAR",
"CHAR_CONST",
"WCHAR_CONST",
"U8CHAR_CONST",
"U16CHAR_CONST",
"U32CHAR_CONST",
# String literals
"STRING_LITERAL",
"WSTRING_LITERAL",
"U8STRING_LITERAL",
"U16STRING_LITERAL",
"U32STRING_LITERAL",
}
def __init__(self, filename: typing.Optional[str], content: str) -> None:
self._lex = PlyLexer(filename)
self._lex.input(content)
self.tokbuf = typing.Deque[LexToken]()
def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
get_token = self._lex.token
tokbuf = self.tokbuf
tok = get_token()
if tok is None:
return False
udl_start = self._user_defined_literal_start
while True:
tok.location = self._lex.current_location()
tokbuf.append(tok)
if tok.type == "NEWLINE":
break
# detect/combine user defined literals
if tok.type in udl_start:
tok2 = get_token()
if tok2 is None:
break
if tok2.type != "NAME" or tok2.value[0] != "_":
tok = tok2
continue
tok.value = tok.value + tok2.value
tok.type = f"UD_{tok.type}"
tok = get_token()
if tok is None:
break
return True
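(Illustrative sketch, not part of this diff: the user-defined-literal handling above merges an integer constant and a following _-prefixed NAME into a single UD_ token — the same behavior the new test_user_defined_literal test below relies on.)
from cxxheaderparser.lexer import LexerTokenStream

stream = LexerTokenStream(None, "units::volt_t v = 1_V;\n")
toks = []
while True:
    tok = stream.token_eof_ok()
    if tok is None:
        break
    toks.append((tok.type, tok.value))

assert ("UD_INT_CONST_DEC", "1_V") in toks    # "1" and "_V" were combined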
def current_location(self) -> Location:
if self.tokbuf:
return self.tokbuf[0].location
return self._lex.current_location()
def get_doxygen(self) -> typing.Optional[str]:
tokbuf = self.tokbuf
# fill the token buffer if it's empty (which indicates a newline)
if not tokbuf and not self._fill_tokbuf(tokbuf):
return None
comments: typing.List[LexToken] = []
# retrieve any comments in the stream right before
# the first non-discard element
keep_going = True
while True:
while tokbuf:
tok = tokbuf.popleft()
if tok.type == "NEWLINE":
comments.clear()
elif tok.type == "WHITESPACE":
pass
elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"):
comments.append(tok)
else:
tokbuf.appendleft(tok)
keep_going = False
break
if not keep_going:
break
if not self._fill_tokbuf(tokbuf):
break
if comments:
return self._extract_comments(comments)
return None
def get_doxygen_after(self) -> typing.Optional[str]:
tokbuf = self.tokbuf
# if there's a newline directly after a statement, we're done
if not tokbuf:
return None
# retrieve comments after non-discard elements
comments: typing.List[LexToken] = []
new_tokbuf = typing.Deque[LexToken]()
# Unlike get_doxygen, this only pulls the comment tokens out of the
# buffer; the other tokens on the line are put back
while tokbuf:
tok = tokbuf.popleft()
if tok.type == "NEWLINE":
break
elif tok.type == "WHITESPACE":
new_tokbuf.append(tok)
elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"):
comments.append(tok)
else:
new_tokbuf.append(tok)
if comments:
break
new_tokbuf.extend(tokbuf)
self.tokbuf = new_tokbuf
if comments:
return self._extract_comments(comments)
return None
def _extract_comments(self, comments: typing.List[LexToken]):
# Now we have comments, need to extract the text from them
comment_lines: typing.List[str] = []
for c in comments:
text = c.value
if c.type == "COMMENT_SINGLELINE":
if text.startswith("///") or text.startswith("//!"):
comment_lines.append(text.rstrip("\n"))
else:
if text.startswith("/**") or text.startswith("/*!"):
# not sure why, but we end up with doubled newlines
text = text.replace("\n\n", "\n")
# strip prefixing whitespace
text = _multicomment_re.sub("\n*", text)
comment_lines = text.splitlines()
comment_str = "\n".join(comment_lines)
if comment_str:
return comment_str
return None
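(Illustrative sketch, not part of this diff: how the two lookup directions differ. get_doxygen() collects comments that precede the next statement, while get_doxygen_after() pulls a trailing comment off the current line; the doxygen tests further down exercise the same paths.)
from cxxheaderparser.lexer import LexerTokenStream

stream = LexerTokenStream(None, "/// above\nint x;\n")
assert stream.get_doxygen() == "/// above"

stream = LexerTokenStream(None, "int x; /// after\n")
for _ in range(3):                 # consume 'int', 'x' and ';'
    stream.token()
assert stream.get_doxygen_after() == "/// after"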
class BoundedTokenStream(TokenStream):
"""
Provides tokens from a fixed list of tokens.
Intended for use when you have a group of tokens that you know
must be consumed, such as a paren grouping or some type of
lookahead case
"""
def __init__(self, toks: typing.List[LexToken]) -> None:
self.tokbuf = typing.Deque[LexToken](toks)
def has_tokens(self) -> bool:
return len(self.tokbuf) > 0
def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
from .errors import CxxParseError
raise CxxParseError("no more tokens left in this group")
def current_location(self) -> Location:
if self.tokbuf:
return self.tokbuf[0].location
raise ValueError("internal error")
def get_doxygen(self) -> typing.Optional[str]:
# comment tokens aren't going to be in this stream
return None
def get_doxygen_after(self) -> typing.Optional[str]:
return None
if __name__ == "__main__": # pragma: no cover if __name__ == "__main__": # pragma: no cover
try: try:
lex.runmain(lexer=Lexer(None)) lex.runmain(lexer=PlyLexer(None))
except EOFError: except EOFError:
pass pass


@ -4,8 +4,9 @@ import inspect
import re import re
import typing import typing
from . import lexer
from .errors import CxxParseError from .errors import CxxParseError
from .lexer import Lexer, LexToken, Location, PhonyEnding from .lexer import LexToken, Location, PhonyEnding
from .options import ParserOptions from .options import ParserOptions
from .parserstate import ( from .parserstate import (
ClassBlockState, ClassBlockState,
@ -80,8 +81,7 @@ class CxxParser:
self.visitor = visitor self.visitor = visitor
self.filename = filename self.filename = filename
self.lex = Lexer(filename) self.lex: lexer.TokenStream = lexer.LexerTokenStream(filename, content)
self.lex.input(content)
global_ns = NamespaceDecl([], False) global_ns = NamespaceDecl([], False)
self.current_namespace = global_ns self.current_namespace = global_ns
@ -308,25 +308,34 @@ class CxxParser:
";": lambda _1, _2: None, ";": lambda _1, _2: None,
} }
_keep_doxygen = {"__declspec", "alignas", "__attribute__", "DBL_LBRACKET"}
tok = None tok = None
get_token_eof_ok = self.lex.token_eof_ok get_token_eof_ok = self.lex.token_eof_ok
get_doxygen = self.lex.get_doxygen get_doxygen = self.lex.get_doxygen
doxygen = None
try: try:
while True: while True:
if doxygen is None:
doxygen = get_doxygen()
tok = get_token_eof_ok() tok = get_token_eof_ok()
if not tok: if not tok:
break break
doxygen = get_doxygen()
fn = _translation_unit_tokens.get(tok.type) fn = _translation_unit_tokens.get(tok.type)
if fn: if fn:
fn(tok, doxygen) fn(tok, doxygen)
if tok.type not in _keep_doxygen:
doxygen = None
else: else:
# this processes ambiguous declarations # this processes ambiguous declarations
self._parse_declarations(tok, doxygen) self._parse_declarations(tok, doxygen)
doxygen = None
except Exception as e: except Exception as e:
if self.verbose: if self.verbose:
@ -610,7 +619,12 @@ class CxxParser:
# append a token to make other parsing components happy # append a token to make other parsing components happy
raw_toks.append(PhonyEnding) raw_toks.append(PhonyEnding)
with self.lex.set_group_of_tokens(raw_toks) as remainder: old_lex = self.lex
try:
# set up a temporary token stream with the tokens we need to parse
tmp_lex = lexer.BoundedTokenStream(raw_toks)
self.lex = tmp_lex
try: try:
parsed_type, mods = self._parse_type(None) parsed_type, mods = self._parse_type(None)
if parsed_type is None: if parsed_type is None:
@ -622,9 +636,12 @@ class CxxParser:
except CxxParseError: except CxxParseError:
dtype = None dtype = None
else: else:
if remainder: if tmp_lex.has_tokens():
dtype = None dtype = None
finally:
self.lex = old_lex
if self.lex.token_if("ELLIPSIS"): if self.lex.token_if("ELLIPSIS"):
param_pack = True param_pack = True
@ -939,12 +956,16 @@ class CxxParser:
values: typing.List[Enumerator] = [] values: typing.List[Enumerator] = []
while True: while True:
doxygen = self.lex.get_doxygen()
name_tok = self._next_token_must_be("}", "NAME") name_tok = self._next_token_must_be("}", "NAME")
if name_tok.value == "}": if name_tok.value == "}":
break break
if doxygen is None:
doxygen = self.lex.get_doxygen_after()
name = name_tok.value name = name_tok.value
doxygen = self.lex.get_doxygen()
value = None value = None
tok = self._next_token_must_be("}", ",", "=", "DBL_LBRACKET") tok = self._next_token_must_be("}", ",", "=", "DBL_LBRACKET")
@ -1171,7 +1192,7 @@ class CxxParser:
def _parse_bitfield(self) -> int: def _parse_bitfield(self) -> int:
# is an integral constant expression... for now, just do integers # is an integral constant expression... for now, just do integers
tok = self._next_token_must_be("NUMBER") tok = self._next_token_must_be("INT_CONST_DEC")
return int(tok.value) return int(tok.value)
def _parse_field( def _parse_field(
@ -1244,7 +1265,7 @@ class CxxParser:
if doxygen is None: if doxygen is None:
# try checking after the var # try checking after the var
doxygen = self.lex.get_doxygen() doxygen = self.lex.get_doxygen_after()
if is_typedef: if is_typedef:
if not name: if not name:


@ -1,15 +1,28 @@
import typing import typing
from .lexer import LexToken, Lexer from .lexer import LexToken, PlyLexer, LexerTokenStream
from .types import Token from .types import Token
# key: token type, value: (left spacing, right spacing) # key: token type, value: (left spacing, right spacing)
_want_spacing = { _want_spacing = {
"NUMBER": (2, 2), "FLOAT_CONST": (2, 2),
"FLOAT_NUMBER": (2, 2), "HEX_FLOAT_CONST": (2, 2),
"INT_CONST_HEX": (2, 2),
"INT_CONST_BIN": (2, 2),
"INT_CONST_OCT": (2, 2),
"INT_CONST_DEC": (2, 2),
"INT_CONST_CHAR": (2, 2),
"NAME": (2, 2), "NAME": (2, 2),
"CHAR_LITERAL": (2, 2), "CHAR_CONST": (2, 2),
"WCHAR_CONST": (2, 2),
"U8CHAR_CONST": (2, 2),
"U16CHAR_CONST": (2, 2),
"U32CHAR_CONST": (2, 2),
"STRING_LITERAL": (2, 2), "STRING_LITERAL": (2, 2),
"WSTRING_LITERAL": (2, 2),
"U8STRING_LITERAL": (2, 2),
"U16STRING_LITERAL": (2, 2),
"U32STRING_LITERAL": (2, 2),
"ELLIPSIS": (2, 2), "ELLIPSIS": (2, 2),
">": (0, 2), ">": (0, 2),
")": (0, 1), ")": (0, 1),
@ -19,7 +32,7 @@ _want_spacing = {
"&": (0, 2), "&": (0, 2),
} }
_want_spacing.update(dict.fromkeys(Lexer.keywords, (2, 2))) _want_spacing.update(dict.fromkeys(PlyLexer.keywords, (2, 2)))
def tokfmt(toks: typing.List[Token]) -> str: def tokfmt(toks: typing.List[Token]) -> str:
@ -54,9 +67,9 @@ if __name__ == "__main__": # pragma: no cover
parser.add_argument("header") parser.add_argument("header")
args = parser.parse_args() args = parser.parse_args()
lexer = Lexer(args.header) filename: str = args.header
with open(lexer.filename) as fp: with open(filename) as fp:
lexer.input(fp.read()) # type: ignore lexer = LexerTokenStream(filename, fp.read())
toks: typing.List[Token] = [] toks: typing.List[Token] = []
while True: while True:


@ -209,16 +209,13 @@ def test_doxygen_fn_3slash() -> None:
) )
def test_doxygen_fn_cstyle() -> None: def test_doxygen_fn_cstyle1() -> None:
content = """ content = """
// clang-format off
/** /**
* fn comment * fn comment
*/ */
void void
fn(); fn();
""" """
data = parse_string(content, cleandoc=True) data = parse_string(content, cleandoc=True)
@ -238,6 +235,32 @@ def test_doxygen_fn_cstyle() -> None:
) )
def test_doxygen_fn_cstyle2() -> None:
content = """
/*!
* fn comment
*/
void
fn();
"""
data = parse_string(content, cleandoc=True)
assert data == ParsedData(
namespace=NamespaceScope(
functions=[
Function(
return_type=Type(
typename=PQName(segments=[FundamentalSpecifier(name="void")])
),
name=PQName(segments=[NameSpecifier(name="fn")]),
parameters=[],
doxygen="/*!\n* fn comment\n*/",
)
]
)
)
def test_doxygen_var_above() -> None: def test_doxygen_var_above() -> None:
content = """ content = """
// clang-format off // clang-format off
@ -292,6 +315,44 @@ def test_doxygen_var_after() -> None:
) )
def test_doxygen_multiple_variables() -> None:
content = """
int x; /// this is x
int y; /// this is y
/// this is also y
int z; /// this is z
"""
data = parse_string(content, cleandoc=True)
assert data == ParsedData(
namespace=NamespaceScope(
variables=[
Variable(
name=PQName(segments=[NameSpecifier(name="x")]),
type=Type(
typename=PQName(segments=[FundamentalSpecifier(name="int")])
),
doxygen="/// this is x",
),
Variable(
name=PQName(segments=[NameSpecifier(name="y")]),
type=Type(
typename=PQName(segments=[FundamentalSpecifier(name="int")])
),
doxygen="/// this is y\n/// this is also y",
),
Variable(
name=PQName(segments=[NameSpecifier(name="z")]),
type=Type(
typename=PQName(segments=[FundamentalSpecifier(name="int")])
),
doxygen="/// this is z",
),
]
)
)
def test_doxygen_namespace() -> None: def test_doxygen_namespace() -> None:
content = """ content = """
/** /**
@ -329,3 +390,50 @@ def test_doxygen_namespace() -> None:
} }
) )
) )
def test_doxygen_declspec() -> None:
content = """
/// declspec comment
__declspec(thread) int i = 1;
"""
data = parse_string(content, cleandoc=True)
assert data == ParsedData(
namespace=NamespaceScope(
variables=[
Variable(
name=PQName(segments=[NameSpecifier(name="i")]),
type=Type(
typename=PQName(segments=[FundamentalSpecifier(name="int")])
),
value=Value(tokens=[Token(value="1")]),
doxygen="/// declspec comment",
)
]
)
)
def test_doxygen_attribute() -> None:
content = """
/// hasattr comment
[[nodiscard]]
int hasattr();
"""
data = parse_string(content, cleandoc=True)
assert data == ParsedData(
namespace=NamespaceScope(
functions=[
Function(
return_type=Type(
typename=PQName(segments=[FundamentalSpecifier(name="int")])
),
name=PQName(segments=[NameSpecifier(name="hasattr")]),
parameters=[],
doxygen="/// hasattr comment",
)
]
)
)


@ -236,3 +236,34 @@ def test_final() -> None:
], ],
) )
) )
#
# User defined literals
#
def test_user_defined_literal() -> None:
content = """
units::volt_t v = 1_V;
"""
data = parse_string(content, cleandoc=True)
assert data == ParsedData(
namespace=NamespaceScope(
variables=[
Variable(
name=PQName(segments=[NameSpecifier(name="v")]),
type=Type(
typename=PQName(
segments=[
NameSpecifier(name="units"),
NameSpecifier(name="volt_t"),
]
)
),
value=Value(tokens=[Token(value="1_V")]),
)
]
)
)


@ -1,6 +1,6 @@
import pytest import pytest
from cxxheaderparser.lexer import Lexer from cxxheaderparser.lexer import PlyLexer, LexerTokenStream
from cxxheaderparser.tokfmt import tokfmt from cxxheaderparser.tokfmt import tokfmt
from cxxheaderparser.types import Token from cxxheaderparser.types import Token
@ -40,14 +40,15 @@ def test_tokfmt(instr: str) -> None:
Each input string is exactly what the output of tokfmt should be Each input string is exactly what the output of tokfmt should be
""" """
toks = [] toks = []
lexer = Lexer("") lexer = PlyLexer("")
lexer.input(instr) lexer.input(instr)
while True: while True:
tok = lexer.token_eof_ok() tok = lexer.token()
if not tok: if not tok:
break break
if tok.type not in LexerTokenStream._discard_types:
toks.append(Token(tok.value, tok.type)) toks.append(Token(tok.value, tok.type))
assert tokfmt(toks) == instr assert tokfmt(toks) == instr