From b47eb7ce10c7f5e67bd9059ac541e837ad9d2049 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sat, 10 Dec 2022 14:08:59 -0500 Subject: [PATCH 1/9] Use lex.TOKEN decorator for lexer tokens instead of docstrings - Allows usage with -OO --- cxxheaderparser/lexer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 9650190..83032a7 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -5,6 +5,7 @@ import typing import sys from ._ply import lex +from ._ply.lex import TOKEN if sys.version_info >= (3, 8): @@ -186,14 +187,14 @@ class Lexer: t_NUMBER = r"[0-9][0-9XxA-Fa-f]*" t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?" + @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*") def t_NAME(self, t: LexToken) -> LexToken: - r"[A-Za-z_~][A-Za-z0-9_]*" if t.value in self.keywords: t.type = t.value return t + @TOKEN(r"\#.*") def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]: - r"\#.*" m = _line_re.match(t.value) if m: filename = m.group(2) @@ -207,8 +208,8 @@ class Lexer: else: return t + @TOKEN(r"\/\/.*\n?") def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken: - r"\/\/.*\n?" if t.value.startswith("///") or t.value.startswith("//!"): self.comments.append(t.value.lstrip("\t ").rstrip("\n")) t.lexer.lineno += t.value.count("\n") @@ -230,8 +231,8 @@ class Lexer: t_STRING_LITERAL = r'"([^"\\]|\\.)*"' # Found at http://ostermiller.org/findcomment.html + @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?") def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken: - r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?" if t.value.startswith("/**") or t.value.startswith("/*!"): # not sure why, but get double new lines v = t.value.replace("\n\n", "\n") @@ -241,8 +242,8 @@ class Lexer: t.lexer.lineno += t.value.count("\n") return t + @TOKEN(r"\n+") def t_NEWLINE(self, t: LexToken) -> LexToken: - r"\n+" t.lexer.lineno += len(t.value) del self.comments[:] return t From 03c24a207440dbcbb7084f3d0c9838990cb65605 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sat, 10 Dec 2022 14:48:20 -0500 Subject: [PATCH 2/9] Better lexer error handling --- cxxheaderparser/errors.py | 3 ++- cxxheaderparser/lexer.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cxxheaderparser/errors.py b/cxxheaderparser/errors.py index e08b510..d457f51 100644 --- a/cxxheaderparser/errors.py +++ b/cxxheaderparser/errors.py @@ -1,6 +1,7 @@ import typing -from .lexer import LexToken +if typing.TYPE_CHECKING: + from .lexer import LexToken class CxxParseError(Exception): diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 83032a7..36c6c34 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -7,6 +7,12 @@ import sys from ._ply import lex from ._ply.lex import TOKEN +from .errors import CxxParseError + + +class LexError(CxxParseError): + pass + if sys.version_info >= (3, 8): from typing import Protocol @@ -249,7 +255,11 @@ class Lexer: return t def t_error(self, t: LexToken) -> None: - print("Lex error: ", t) + self._error(f"Illegal character {t.value!r}", t) + + def _error(self, msg: str, tok: LexToken): + tok.location = self.current_location() + raise LexError(msg, tok) _lexer = None lex: lex.Lexer From aee776072eac004dc8a08d53e95a62090544dcd9 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sat, 10 Dec 2022 15:16:05 -0500 Subject: [PATCH 3/9] Grab string/character lexer constants from pycparser --- LICENSE.txt | 30 ++++- cxxheaderparser/lexer.py | 246 
++++++++++++++++++++++++++++++++++++-- cxxheaderparser/parser.py | 2 +- cxxheaderparser/tokfmt.py | 19 ++- 4 files changed, 282 insertions(+), 15 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 1d8f05b..5d5e3a0 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ cxxheaderparser license: -Copyright (c) 2020 Dustin Spicuzza +Copyright (c) 2020-2022 Dustin Spicuzza All rights reserved. Redistribution and use in source and binary forms, with or without @@ -102,3 +102,31 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- + +pycparser -- A C parser in Python + +Copyright (c) 2008-2022, Eli Bendersky +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 36c6c34..f8d3285 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -61,6 +61,10 @@ PhonyEnding.lexpos = 0 class Lexer: + """ + This lexer is a combination of pieces from the PLY lexers that CppHeaderParser + and pycparser have. 
+ """ keywords = { "__attribute__", @@ -144,15 +148,33 @@ class Lexer: } tokens = [ - "NUMBER", - "FLOAT_NUMBER", + # constants + "FLOAT_CONST", + "HEX_FLOAT_CONST", + "INT_CONST_HEX", + "INT_CONST_BIN", + "INT_CONST_OCT", + "INT_CONST_DEC", + "INT_CONST_CHAR", + "CHAR_CONST", + "WCHAR_CONST", + "U8CHAR_CONST", + "U16CHAR_CONST", + "U32CHAR_CONST", + # String literals + "STRING_LITERAL", + "WSTRING_LITERAL", + "U8STRING_LITERAL", + "U16STRING_LITERAL", + "U32STRING_LITERAL", + # "NAME", + # Comments "COMMENT_SINGLELINE", "COMMENT_MULTILINE", "PRECOMP_MACRO", + # misc "DIVIDE", - "CHAR_LITERAL", - "STRING_LITERAL", "NEWLINE", "ELLIPSIS", "DBL_LBRACKET", @@ -189,9 +211,216 @@ class Lexer: ".", ] + # + # Regexes for use in tokens (taken from pycparser) + # + + hex_prefix = "0[xX]" + hex_digits = "[0-9a-fA-F]+" + bin_prefix = "0[bB]" + bin_digits = "[01]+" + + # integer constants (K&R2: A.2.5.1) + integer_suffix_opt = ( + r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?" + ) + decimal_constant = ( + "(0" + integer_suffix_opt + ")|([1-9][0-9]*" + integer_suffix_opt + ")" + ) + octal_constant = "0[0-7]*" + integer_suffix_opt + hex_constant = hex_prefix + hex_digits + integer_suffix_opt + bin_constant = bin_prefix + bin_digits + integer_suffix_opt + + bad_octal_constant = "0[0-7]*[89]" + + # character constants (K&R2: A.2.5.2) + # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line + # directives with Windows paths as filenames (..\..\dir\file) + # For the same reason, decimal_escape allows all digit sequences. We want to + # parse all correct code, even if it means to sometimes parse incorrect + # code. + # + # The original regexes were taken verbatim from the C syntax definition, + # and were later modified to avoid worst-case exponential running time. + # + # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" + # decimal_escape = r"""(\d+)""" + # hex_escape = r"""(x[0-9a-fA-F]+)""" + # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])""" + # + # The following modifications were made to avoid the ambiguity that allowed backtracking: + # (https://github.com/eliben/pycparser/issues/61) + # + # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape. + # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex + # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal + # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape. + # + # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways. + # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`. 
+ + simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))""" + decimal_escape = r"""(\d+)(?!\d)""" + hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])""" + bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])""" + + escape_sequence = ( + r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))" + ) + + # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed + # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to + + escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])""" + + cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")" + char_const = "'" + cconst_char + "'" + wchar_const = "L" + char_const + u8char_const = "u8" + char_const + u16char_const = "u" + char_const + u32char_const = "U" + char_const + multicharacter_constant = "'" + cconst_char + "{2,4}'" + unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)" + bad_char_const = ( + r"""('""" + + cconst_char + + """[^'\n]+')|('')|('""" + + bad_escape + + r"""[^'\n]*')""" + ) + + # string literals (K&R2: A.2.6) + string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")" + string_literal = '"' + string_char + '*"' + wstring_literal = "L" + string_literal + u8string_literal = "u8" + string_literal + u16string_literal = "u" + string_literal + u32string_literal = "U" + string_literal + bad_string_literal = '"' + string_char + "*" + bad_escape + string_char + '*"' + + # floating constants (K&R2: A.2.5.3) + exponent_part = r"""([eE][-+]?[0-9]+)""" + fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)""" + floating_constant = ( + "((((" + + fractional_constant + + ")" + + exponent_part + + "?)|([0-9]+" + + exponent_part + + "))[FfLl]?)" + ) + binary_exponent_part = r"""([pP][+-]?[0-9]+)""" + hex_fractional_constant = ( + "(((" + hex_digits + r""")?\.""" + hex_digits + ")|(" + hex_digits + r"""\.))""" + ) + hex_floating_constant = ( + "(" + + hex_prefix + + "(" + + hex_digits + + "|" + + hex_fractional_constant + + ")" + + binary_exponent_part + + "[FfLl]?)" + ) + t_ignore = " \t\r?@\f" - t_NUMBER = r"[0-9][0-9XxA-Fa-f]*" - t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?" 
+ + # The following floating and integer constants are defined as + # functions to impose a strict order (otherwise, decimal + # is placed before the others because its regex is longer, + # and this is bad) + # + @TOKEN(floating_constant) + def t_FLOAT_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(hex_floating_constant) + def t_HEX_FLOAT_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(hex_constant) + def t_INT_CONST_HEX(self, t: LexToken) -> LexToken: + return t + + @TOKEN(bin_constant) + def t_INT_CONST_BIN(self, t: LexToken) -> LexToken: + return t + + @TOKEN(bad_octal_constant) + def t_BAD_CONST_OCT(self, t: LexToken) -> None: + msg = "Invalid octal constant" + self._error(msg, t) + + @TOKEN(octal_constant) + def t_INT_CONST_OCT(self, t: LexToken) -> LexToken: + return t + + @TOKEN(decimal_constant) + def t_INT_CONST_DEC(self, t: LexToken) -> LexToken: + return t + + # Must come before bad_char_const, to prevent it from + # catching valid char constants as invalid + # + @TOKEN(multicharacter_constant) + def t_INT_CONST_CHAR(self, t: LexToken) -> LexToken: + return t + + @TOKEN(char_const) + def t_CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(wchar_const) + def t_WCHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u8char_const) + def t_U8CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u16char_const) + def t_U16CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u32char_const) + def t_U32CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(unmatched_quote) + def t_UNMATCHED_QUOTE(self, t: LexToken) -> None: + msg = "Unmatched '" + self._error(msg, t) + + @TOKEN(bad_char_const) + def t_BAD_CHAR_CONST(self, t: LexToken) -> None: + msg = "Invalid char constant %s" % t.value + self._error(msg, t) + + @TOKEN(wstring_literal) + def t_WSTRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u8string_literal) + def t_U8STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u16string_literal) + def t_U16STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u32string_literal) + def t_U32STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + # unmatched string literals are caught by the preprocessor + + @TOKEN(bad_string_literal) + def t_BAD_STRING_LITERAL(self, t): + msg = "String contains invalid escape code" + self._error(msg, t) @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*") def t_NAME(self, t: LexToken) -> LexToken: @@ -222,7 +451,6 @@ class Lexer: return t t_DIVIDE = r"/(?!/)" - t_CHAR_LITERAL = "'.'" t_ELLIPSIS = r"\.\.\." t_DBL_LBRACKET = r"\[\[" t_DBL_RBRACKET = r"\]\]" @@ -232,9 +460,7 @@ class Lexer: t_SHIFT_LEFT = r"<<" # SHIFT_RIGHT introduces ambiguity - # found at http://wordaligned.org/articles/string-literals-and-regular-expressions - # TODO: This does not work with the string "bla \" bla" - t_STRING_LITERAL = r'"([^"\\]|\\.)*"' + t_STRING_LITERAL = string_literal # Found at http://ostermiller.org/findcomment.html @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?") diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index 67d5a37..0be63e4 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -1171,7 +1171,7 @@ class CxxParser: def _parse_bitfield(self) -> int: # is a integral constant expression... 
for now, just do integers - tok = self._next_token_must_be("NUMBER") + tok = self._next_token_must_be("INT_CONST_DEC") return int(tok.value) def _parse_field( diff --git a/cxxheaderparser/tokfmt.py b/cxxheaderparser/tokfmt.py index 3fa1bf2..296c3d2 100644 --- a/cxxheaderparser/tokfmt.py +++ b/cxxheaderparser/tokfmt.py @@ -5,11 +5,24 @@ from .types import Token # key: token type, value: (left spacing, right spacing) _want_spacing = { - "NUMBER": (2, 2), - "FLOAT_NUMBER": (2, 2), + "FLOAT_CONST": (2, 2), + "HEX_FLOAT_CONST": (2, 2), + "INT_CONST_HEX": (2, 2), + "INT_CONST_BIN": (2, 2), + "INT_CONST_OCT": (2, 2), + "INT_CONST_DEC": (2, 2), + "INT_CONST_CHAR": (2, 2), "NAME": (2, 2), - "CHAR_LITERAL": (2, 2), + "CHAR_CONST": (2, 2), + "WCHAR_CONST": (2, 2), + "U8CHAR_CONST": (2, 2), + "U16CHAR_CONST": (2, 2), + "U32CHAR_CONST": (2, 2), "STRING_LITERAL": (2, 2), + "WSTRING_LITERAL": (2, 2), + "U8STRING_LITERAL": (2, 2), + "U16STRING_LITERAL": (2, 2), + "U32STRING_LITERAL": (2, 2), "ELLIPSIS": (2, 2), ">": (0, 2), ")": (0, 1), From b54c80782427113120192861adffafa93df4552e Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Tue, 13 Dec 2022 23:18:14 -0500 Subject: [PATCH 4/9] Remove unused lexer.filenames --- cxxheaderparser/lexer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index f8d3285..165f08e 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -432,11 +432,7 @@ class Lexer: def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]: m = _line_re.match(t.value) if m: - filename = m.group(2) - if filename not in self._filenames_set: - self.filenames.append(filename) - self._filenames_set.add(filename) - self.filename = filename + self.filename = m.group(2) self.line_offset = 1 + self.lex.lineno - int(m.group(1)) return None @@ -508,13 +504,6 @@ class Lexer: self.filename = filename self.line_offset = 0 - self.filenames: typing.List[str] = [] - self._filenames_set: typing.Set[str] = set() - - if filename: - self.filenames.append(filename) - self._filenames_set.add(filename) - # Doxygen comments self.comments = [] From 079d643c67cae0e08ffed5cbca3b2280a2acc11a Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Tue, 13 Dec 2022 23:53:26 -0500 Subject: [PATCH 5/9] Update PLY package to include version --- cxxheaderparser/_ply/__init__.py | 5 +++++ cxxheaderparser/_ply/lex.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cxxheaderparser/_ply/__init__.py b/cxxheaderparser/_ply/__init__.py index e69de29..863ef54 100644 --- a/cxxheaderparser/_ply/__init__.py +++ b/cxxheaderparser/_ply/__init__.py @@ -0,0 +1,5 @@ +# PLY package +# Author: David Beazley (dave@dabeaz.com) +# https://github.com/dabeaz/ply + +__version__ = "2022.10.27" diff --git a/cxxheaderparser/_ply/lex.py b/cxxheaderparser/_ply/lex.py index 37a29b6..766a917 100644 --- a/cxxheaderparser/_ply/lex.py +++ b/cxxheaderparser/_ply/lex.py @@ -2,7 +2,7 @@ # ----------------------------------------------------------------------------- # ply: lex.py # -# Copyright (C) 2001-2020 +# Copyright (C) 2001-2022 # David M. Beazley (Dabeaz LLC) # All rights reserved. 
# From 2ba5c3c829f840842e7d20a0a54031dbba41e857 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 00:50:08 -0500 Subject: [PATCH 6/9] Support extracting doxygen comments when declspec or attributes are present --- cxxheaderparser/parser.py | 11 ++++++++- tests/test_doxygen.py | 47 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index 0be63e4..9a1e098 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -308,25 +308,34 @@ class CxxParser: ";": lambda _1, _2: None, } + _keep_doxygen = {"__declspec", "alignas", "__attribute__", "DBL_LBRACKET"} + tok = None get_token_eof_ok = self.lex.token_eof_ok get_doxygen = self.lex.get_doxygen + doxygen = None + try: while True: tok = get_token_eof_ok() if not tok: break - doxygen = get_doxygen() + if doxygen is None: + doxygen = get_doxygen() fn = _translation_unit_tokens.get(tok.type) if fn: fn(tok, doxygen) + + if tok.type not in _keep_doxygen: + doxygen = None else: # this processes ambiguous declarations self._parse_declarations(tok, doxygen) + doxygen = None except Exception as e: if self.verbose: diff --git a/tests/test_doxygen.py b/tests/test_doxygen.py index f82e615..88824b9 100644 --- a/tests/test_doxygen.py +++ b/tests/test_doxygen.py @@ -329,3 +329,50 @@ def test_doxygen_namespace() -> None: } ) ) + + +def test_doxygen_declspec() -> None: + content = """ + /// declspec comment + __declspec(thread) int i = 1; + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + variables=[ + Variable( + name=PQName(segments=[NameSpecifier(name="i")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + value=Value(tokens=[Token(value="1")]), + doxygen="/// declspec comment", + ) + ] + ) + ) + + +def test_doxygen_attribute() -> None: + content = """ + /// hasattr comment + [[nodiscard]] + int hasattr(); + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + functions=[ + Function( + return_type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + name=PQName(segments=[NameSpecifier(name="hasattr")]), + parameters=[], + doxygen="/// hasattr comment", + ) + ] + ) + ) From 40bf05b3844cfdc9e219bdd3e4e4b2e5b38c466a Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 00:50:54 -0500 Subject: [PATCH 7/9] Add additional doxygen related testcases to make sure things don't accidentally break --- tests/test_doxygen.py | 69 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/tests/test_doxygen.py b/tests/test_doxygen.py index 88824b9..dcc559d 100644 --- a/tests/test_doxygen.py +++ b/tests/test_doxygen.py @@ -209,16 +209,13 @@ def test_doxygen_fn_3slash() -> None: ) -def test_doxygen_fn_cstyle() -> None: +def test_doxygen_fn_cstyle1() -> None: content = """ - // clang-format off - /** * fn comment */ void fn(); - """ data = parse_string(content, cleandoc=True) @@ -238,6 +235,32 @@ def test_doxygen_fn_cstyle() -> None: ) +def test_doxygen_fn_cstyle2() -> None: + content = """ + /*! 
+ * fn comment + */ + void + fn(); + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + functions=[ + Function( + return_type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="void")]) + ), + name=PQName(segments=[NameSpecifier(name="fn")]), + parameters=[], + doxygen="/*!\n* fn comment\n*/", + ) + ] + ) + ) + + def test_doxygen_var_above() -> None: content = """ // clang-format off @@ -292,6 +315,44 @@ def test_doxygen_var_after() -> None: ) +def test_doxygen_multiple_variables() -> None: + content = """ + int x; /// this is x + int y; /// this is y + /// this is also y + int z; /// this is z + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + variables=[ + Variable( + name=PQName(segments=[NameSpecifier(name="x")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + doxygen="/// this is x", + ), + Variable( + name=PQName(segments=[NameSpecifier(name="y")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + doxygen="/// this is y\n/// this is also y", + ), + Variable( + name=PQName(segments=[NameSpecifier(name="z")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + doxygen="/// this is z", + ), + ] + ) + ) + + def test_doxygen_namespace() -> None: content = """ /** From 1eaa85ae8d04fac47496bfe7f4cd6923418fc432 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 02:38:44 -0500 Subject: [PATCH 8/9] Split the lexer into PlyLexer and TokenStream components - There are two types of token streams: file based, and list based - I think this has better component separation - Doxygen parsing is a bit weirder, but I think it's more straightforward to see all the pieces? --- cxxheaderparser/lexer.py | 368 ++++++++++++++++++++++---------------- cxxheaderparser/parser.py | 32 ++-- cxxheaderparser/tokfmt.py | 10 +- tests/test_tokfmt.py | 6 +- 4 files changed, 246 insertions(+), 170 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 165f08e..096882a 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -50,7 +50,7 @@ class LexToken(Protocol): location: Location #: private - lexer: "Lexer" + lexer: lex.Lexer PhonyEnding: LexToken = lex.LexToken() # type: ignore @@ -60,10 +60,13 @@ PhonyEnding.lineno = 0 PhonyEnding.lexpos = 0 -class Lexer: +class PlyLexer: """ This lexer is a combination of pieces from the PLY lexers that CppHeaderParser and pycparser have. + + This tokenizes the input into tokens. The other lexer classes do more complex + things with the tokens. """ keywords = { @@ -439,13 +442,6 @@ class Lexer: else: return t - @TOKEN(r"\/\/.*\n?") - def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken: - if t.value.startswith("///") or t.value.startswith("//!"): - self.comments.append(t.value.lstrip("\t ").rstrip("\n")) - t.lexer.lineno += t.value.count("\n") - return t - t_DIVIDE = r"/(?!/)" t_ELLIPSIS = r"\.\.\." 
t_DBL_LBRACKET = r"\[\[" @@ -458,22 +454,20 @@ class Lexer: t_STRING_LITERAL = string_literal + @TOKEN(r"\/\/.*\n?") + def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken: + t.lexer.lineno += t.value.count("\n") + return t + # Found at http://ostermiller.org/findcomment.html @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?") def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken: - if t.value.startswith("/**") or t.value.startswith("/*!"): - # not sure why, but get double new lines - v = t.value.replace("\n\n", "\n") - # strip prefixing whitespace - v = _multicomment_re.sub("\n*", v) - self.comments = v.splitlines() t.lexer.lineno += t.value.count("\n") return t @TOKEN(r"\n+") def t_NEWLINE(self, t: LexToken) -> LexToken: t.lexer.lineno += len(t.value) - del self.comments[:] return t def t_error(self, t: LexToken) -> None: @@ -485,9 +479,8 @@ class Lexer: _lexer = None lex: lex.Lexer - lineno: int - def __new__(cls, *args, **kwargs) -> "Lexer": + def __new__(cls, *args, **kwargs) -> "PlyLexer": # only build the lexer once inst = super().__new__(cls) if cls._lexer is None: @@ -499,157 +492,75 @@ class Lexer: def __init__(self, filename: typing.Optional[str] = None): self.input: typing.Callable[[str], None] = self.lex.input + self.token: typing.Callable[[], LexToken] = self.lex.token # For tracking current file/line position self.filename = filename self.line_offset = 0 - # Doxygen comments - self.comments = [] + def current_location(self) -> Location: + return Location(self.filename, self.lex.lineno - self.line_offset) - self.lookahead = typing.Deque[LexToken]() - # For 'set_group_of_tokens' support - self._get_token: typing.Callable[[], LexToken] = self.lex.token - self.lookahead_stack = typing.Deque[typing.Deque[LexToken]]() +class TokenStream: + """ + Provides access to a stream of tokens + """ + + tokbuf: typing.Deque[LexToken] + + def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: + """ + Fills tokbuf with tokens from the next line. Return True if at least + one token was added to the buffer + """ + raise NotImplementedError def current_location(self) -> Location: - if self.lookahead: - return self.lookahead[0].location - return Location(self.filename, self.lex.lineno - self.line_offset) + raise NotImplementedError def get_doxygen(self) -> typing.Optional[str]: """ - This should be called after the first element of something has - been consumed. - - It will lookahead for comments that come after the item, if prior - comments don't exist. 
+ This is called at the point that you want doxygen information """ + raise NotImplementedError - # Assumption: This function is either called at the beginning of a - # statement or at the end of a statement - - if self.comments: - comments = self.comments - else: - comments = [] - # only look for comments until a newline (including lookahead) - for tok in self.lookahead: - if tok.type == "NEWLINE": - return None - - while True: - tok = self._get_token() - comments.extend(self.comments) - - if tok is None: - break - - tok.location = Location(self.filename, tok.lineno - self.line_offset) - ttype = tok.type - if ttype == "NEWLINE": - self.lookahead.append(tok) - break - - if ttype not in self._discard_types: - self.lookahead.append(tok) - - if ttype == "NAME": - break - - del self.comments[:] - - comment_str = "\n".join(comments) - del self.comments[:] - if comment_str: - return comment_str - - return None + def get_doxygen_after(self) -> typing.Optional[str]: + """ + This is called to retrieve doxygen information after a statement + """ + raise NotImplementedError _discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"} - def _token_limit_exceeded(self) -> typing.NoReturn: - from .errors import CxxParseError - - raise CxxParseError("no more tokens left in this group") - - @contextlib.contextmanager - def set_group_of_tokens( - self, toks: typing.List[LexToken] - ) -> typing.Generator[typing.Deque[LexToken], None, None]: - # intended for use when you have a set of tokens that you know - # must be consumed, such as a paren grouping or some type of - # lookahead case - - stack = self.lookahead_stack - restore_fn = False - - if not stack: - restore_fn = True - self._get_token = self._token_limit_exceeded - - this_buf = typing.Deque[LexToken](toks) - prev_buf = self.lookahead - stack.append(prev_buf) - self.lookahead = this_buf - - try: - yield this_buf - finally: - buf = stack.pop() - if prev_buf is not buf: - raise ValueError("internal error") - - self.lookahead = prev_buf - - if restore_fn: - self._get_token = self.lex.token - def token(self) -> LexToken: - tok = None - while self.lookahead: - tok = self.lookahead.popleft() - if tok.type not in self._discard_types: - return tok - + tokbuf = self.tokbuf while True: - tok = self._get_token() - if tok is None: + while tokbuf: + tok = tokbuf.popleft() + if tok.type not in self._discard_types: + return tok + + if not self._fill_tokbuf(tokbuf): raise EOFError("unexpected end of file") - if tok.type not in self._discard_types: - tok.location = Location(self.filename, tok.lineno - self.line_offset) - break - - return tok - def token_eof_ok(self) -> typing.Optional[LexToken]: - tok = None - while self.lookahead: - tok = self.lookahead.popleft() - if tok.type not in self._discard_types: - return tok - + tokbuf = self.tokbuf while True: - tok = self._get_token() - if tok is None: - break + while tokbuf: + tok = tokbuf.popleft() + if tok.type not in self._discard_types: + return tok - if tok.type not in self._discard_types: - tok.location = Location(self.filename, tok.lineno - self.line_offset) - break - - return tok + if not self._fill_tokbuf(tokbuf): + return None def token_if(self, *types: str) -> typing.Optional[LexToken]: tok = self.token_eof_ok() if tok is None: return None if tok.type not in types: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -658,9 +569,7 @@ class Lexer: if tok is None: return None if 
tok.type not in types: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -669,9 +578,7 @@ class Lexer: if tok is None: return None if tok.value not in vals: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -680,9 +587,7 @@ class Lexer: if tok is None: return None if tok.type in types: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -690,18 +595,177 @@ class Lexer: tok = self.token_eof_ok() if not tok: return False - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return tok.type in types def return_token(self, tok: LexToken) -> None: - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) def return_tokens(self, toks: typing.Sequence[LexToken]) -> None: - self.lookahead.extendleft(reversed(toks)) + self.tokbuf.extendleft(reversed(toks)) + + +class LexerTokenStream(TokenStream): + """ + Provides tokens from using PlyLexer on the given input text + """ + + def __init__(self, filename: typing.Optional[str], content: str) -> None: + self._lex = PlyLexer(filename) + self._lex.input(content) + self.tokbuf = typing.Deque[LexToken]() + + def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: + get_token = self._lex.token + tokbuf = self.tokbuf + + tok = get_token() + if tok is None: + return False + + while True: + tok.location = self._lex.current_location() + tokbuf.append(tok) + + if tok.type == "NEWLINE": + break + + tok = get_token() + if tok is None: + break + + return True + + def current_location(self) -> Location: + if self.tokbuf: + return self.tokbuf[0].location + return self._lex.current_location() + + def get_doxygen(self) -> typing.Optional[str]: + + tokbuf = self.tokbuf + + # fill the token buffer if it's empty (which indicates a newline) + if not tokbuf and not self._fill_tokbuf(tokbuf): + return None + + comments: typing.List[LexToken] = [] + + # retrieve any comments in the stream right before + # the first non-discard element + keep_going = True + while True: + while tokbuf: + tok = tokbuf.popleft() + if tok.type == "NEWLINE": + comments.clear() + elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): + comments.append(tok) + else: + tokbuf.appendleft(tok) + keep_going = False + break + + if not keep_going: + break + + if not self._fill_tokbuf(tokbuf): + break + + if comments: + return self._extract_comments(comments) + + return None + + def get_doxygen_after(self) -> typing.Optional[str]: + tokbuf = self.tokbuf + + # if there's a newline directly after a statement, we're done + if not tokbuf: + return None + + # retrieve comments after non-discard elements + comments: typing.List[LexToken] = [] + new_tokbuf = typing.Deque[LexToken]() + + # This is different: we only extract tokens here + while tokbuf: + tok = tokbuf.popleft() + if tok.type == "NEWLINE": + break + elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): + comments.append(tok) + else: + new_tokbuf.append(tok) + if comments: + break + + new_tokbuf.extend(tokbuf) + self.tokbuf = new_tokbuf + + if comments: + return self._extract_comments(comments) + + return None + + def _extract_comments(self, comments: typing.List[LexToken]): + # Now we have comments, need to extract the text from them + comment_lines: 
typing.List[str] = [] + for c in comments: + text = c.value + if c.type == "COMMENT_SINGLELINE": + if text.startswith("///") or text.startswith("//!"): + comment_lines.append(text.rstrip("\n")) + else: + if text.startswith("/**") or text.startswith("/*!"): + # not sure why, but get double new lines + text = text.replace("\n\n", "\n") + # strip prefixing whitespace + text = _multicomment_re.sub("\n*", text) + comment_lines = text.splitlines() + + comment_str = "\n".join(comment_lines) + if comment_str: + return comment_str + + return None + + +class BoundedTokenStream(TokenStream): + """ + Provides tokens from a fixed list of tokens. + + Intended for use when you have a group of tokens that you know + must be consumed, such as a paren grouping or some type of + lookahead case + """ + + def __init__(self, toks: typing.List[LexToken]) -> None: + self.tokbuf = typing.Deque[LexToken](toks) + + def has_tokens(self) -> bool: + return len(self.tokbuf) > 0 + + def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: + from .errors import CxxParseError + + raise CxxParseError("no more tokens left in this group") + + def current_location(self) -> Location: + if self.tokbuf: + return self.tokbuf[0].location + raise ValueError("internal error") + + def get_doxygen(self) -> typing.Optional[str]: + # comment tokens aren't going to be in this stream + return None + + def get_doxygen_after(self) -> typing.Optional[str]: + return None if __name__ == "__main__": # pragma: no cover try: - lex.runmain(lexer=Lexer(None)) + lex.runmain(lexer=PlyLexer(None)) except EOFError: pass diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index 9a1e098..6c6e764 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -4,8 +4,9 @@ import inspect import re import typing +from . 
import lexer from .errors import CxxParseError -from .lexer import Lexer, LexToken, Location, PhonyEnding +from .lexer import LexToken, Location, PhonyEnding from .options import ParserOptions from .parserstate import ( ClassBlockState, @@ -80,8 +81,7 @@ class CxxParser: self.visitor = visitor self.filename = filename - self.lex = Lexer(filename) - self.lex.input(content) + self.lex: lexer.TokenStream = lexer.LexerTokenStream(filename, content) global_ns = NamespaceDecl([], False) self.current_namespace = global_ns @@ -319,13 +319,13 @@ class CxxParser: try: while True: + if doxygen is None: + doxygen = get_doxygen() + tok = get_token_eof_ok() if not tok: break - if doxygen is None: - doxygen = get_doxygen() - fn = _translation_unit_tokens.get(tok.type) if fn: fn(tok, doxygen) @@ -619,7 +619,12 @@ class CxxParser: # append a token to make other parsing components happy raw_toks.append(PhonyEnding) - with self.lex.set_group_of_tokens(raw_toks) as remainder: + old_lex = self.lex + try: + # set up a temporary token stream with the tokens we need to parse + tmp_lex = lexer.BoundedTokenStream(raw_toks) + self.lex = tmp_lex + try: parsed_type, mods = self._parse_type(None) if parsed_type is None: @@ -631,9 +636,12 @@ class CxxParser: except CxxParseError: dtype = None else: - if remainder: + if tmp_lex.has_tokens(): dtype = None + finally: + self.lex = old_lex + if self.lex.token_if("ELLIPSIS"): param_pack = True @@ -948,12 +956,16 @@ class CxxParser: values: typing.List[Enumerator] = [] while True: + doxygen = self.lex.get_doxygen() + name_tok = self._next_token_must_be("}", "NAME") if name_tok.value == "}": break + if doxygen is None: + doxygen = self.lex.get_doxygen_after() + name = name_tok.value - doxygen = self.lex.get_doxygen() value = None tok = self._next_token_must_be("}", ",", "=", "DBL_LBRACKET") @@ -1253,7 +1265,7 @@ class CxxParser: if doxygen is None: # try checking after the var - doxygen = self.lex.get_doxygen() + doxygen = self.lex.get_doxygen_after() if is_typedef: if not name: diff --git a/cxxheaderparser/tokfmt.py b/cxxheaderparser/tokfmt.py index 296c3d2..f2bb67c 100644 --- a/cxxheaderparser/tokfmt.py +++ b/cxxheaderparser/tokfmt.py @@ -1,6 +1,6 @@ import typing -from .lexer import LexToken, Lexer +from .lexer import LexToken, PlyLexer, LexerTokenStream from .types import Token # key: token type, value: (left spacing, right spacing) @@ -32,7 +32,7 @@ _want_spacing = { "&": (0, 2), } -_want_spacing.update(dict.fromkeys(Lexer.keywords, (2, 2))) +_want_spacing.update(dict.fromkeys(PlyLexer.keywords, (2, 2))) def tokfmt(toks: typing.List[Token]) -> str: @@ -67,9 +67,9 @@ if __name__ == "__main__": # pragma: no cover parser.add_argument("header") args = parser.parse_args() - lexer = Lexer(args.header) - with open(lexer.filename) as fp: - lexer.input(fp.read()) # type: ignore + filename: str = args.header + with open(filename) as fp: + lexer = LexerTokenStream(filename, fp.read()) toks: typing.List[Token] = [] while True: diff --git a/tests/test_tokfmt.py b/tests/test_tokfmt.py index ba245c3..758b9f6 100644 --- a/tests/test_tokfmt.py +++ b/tests/test_tokfmt.py @@ -1,6 +1,6 @@ import pytest -from cxxheaderparser.lexer import Lexer +from cxxheaderparser.lexer import PlyLexer from cxxheaderparser.tokfmt import tokfmt from cxxheaderparser.types import Token @@ -40,11 +40,11 @@ def test_tokfmt(instr: str) -> None: Each input string is exactly what the output of tokfmt should be """ toks = [] - lexer = Lexer("") + lexer = PlyLexer("") lexer.input(instr) while True: - tok = 
lexer.token_eof_ok() + tok = lexer.token() if not tok: break From e5295070a036e628f42a25c6d453f21733bf3736 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 02:55:07 -0500 Subject: [PATCH 9/9] Add support for parsing user defined literals --- cxxheaderparser/lexer.py | 51 ++++++++++++++++++++++++++++++++++++++-- tests/test_misc.py | 31 ++++++++++++++++++++++++ tests/test_tokfmt.py | 5 ++-- 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 096882a..075e2cf 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -179,6 +179,7 @@ class PlyLexer: # misc "DIVIDE", "NEWLINE", + "WHITESPACE", "ELLIPSIS", "DBL_LBRACKET", "DBL_RBRACKET", @@ -329,7 +330,8 @@ class PlyLexer: + "[FfLl]?)" ) - t_ignore = " \t\r?@\f" + t_WHITESPACE = "[ \t]+" + t_ignore = "\r" # The following floating and integer constants are defined as # functions to impose a strict order (otherwise, decimal @@ -531,7 +533,12 @@ class TokenStream: """ raise NotImplementedError - _discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"} + _discard_types = { + "NEWLINE", + "COMMENT_SINGLELINE", + "COMMENT_MULTILINE", + "WHITESPACE", + } def token(self) -> LexToken: tokbuf = self.tokbuf @@ -610,6 +617,27 @@ class LexerTokenStream(TokenStream): Provides tokens from using PlyLexer on the given input text """ + _user_defined_literal_start = { + "FLOAT_CONST", + "HEX_FLOAT_CONST", + "INT_CONST_HEX", + "INT_CONST_BIN", + "INT_CONST_OCT", + "INT_CONST_DEC", + "INT_CONST_CHAR", + "CHAR_CONST", + "WCHAR_CONST", + "U8CHAR_CONST", + "U16CHAR_CONST", + "U32CHAR_CONST", + # String literals + "STRING_LITERAL", + "WSTRING_LITERAL", + "U8STRING_LITERAL", + "U16STRING_LITERAL", + "U32STRING_LITERAL", + } + def __init__(self, filename: typing.Optional[str], content: str) -> None: self._lex = PlyLexer(filename) self._lex.input(content) @@ -623,6 +651,8 @@ class LexerTokenStream(TokenStream): if tok is None: return False + udl_start = self._user_defined_literal_start + while True: tok.location = self._lex.current_location() tokbuf.append(tok) @@ -630,6 +660,19 @@ class LexerTokenStream(TokenStream): if tok.type == "NEWLINE": break + # detect/combine user defined literals + if tok.type in udl_start: + tok2 = get_token() + if tok2 is None: + break + + if tok2.type != "NAME" or tok2.value[0] != "_": + tok = tok2 + continue + + tok.value = tok.value + tok2.value + tok.type = f"UD_{tok.type}" + tok = get_token() if tok is None: break @@ -659,6 +702,8 @@ class LexerTokenStream(TokenStream): tok = tokbuf.popleft() if tok.type == "NEWLINE": comments.clear() + elif tok.type == "WHITESPACE": + pass elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): comments.append(tok) else: @@ -693,6 +738,8 @@ class LexerTokenStream(TokenStream): tok = tokbuf.popleft() if tok.type == "NEWLINE": break + elif tok.type == "WHITESPACE": + new_tokbuf.append(tok) elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): comments.append(tok) else: diff --git a/tests/test_misc.py b/tests/test_misc.py index b90fd96..81d4a0e 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -236,3 +236,34 @@ def test_final() -> None: ], ) ) + + +# +# User defined literals +# + + +def test_user_defined_literal() -> None: + content = """ + units::volt_t v = 1_V; + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + variables=[ + Variable( + name=PQName(segments=[NameSpecifier(name="v")]), + type=Type( + 
typename=PQName( + segments=[ + NameSpecifier(name="units"), + NameSpecifier(name="volt_t"), + ] + ) + ), + value=Value(tokens=[Token(value="1_V")]), + ) + ] + ) + ) diff --git a/tests/test_tokfmt.py b/tests/test_tokfmt.py index 758b9f6..cc0b379 100644 --- a/tests/test_tokfmt.py +++ b/tests/test_tokfmt.py @@ -1,6 +1,6 @@ import pytest -from cxxheaderparser.lexer import PlyLexer +from cxxheaderparser.lexer import PlyLexer, LexerTokenStream from cxxheaderparser.tokfmt import tokfmt from cxxheaderparser.types import Token @@ -48,6 +48,7 @@ def test_tokfmt(instr: str) -> None: if not tok: break - toks.append(Token(tok.value, tok.type)) + if tok.type not in LexerTokenStream._discard_types: + toks.append(Token(tok.value, tok.type)) assert tokfmt(toks) == instr
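
Taken together, patches 8 and 9 mean that a numeric or string constant followed immediately by a NAME beginning with an underscore is folded into a single UD_* token inside LexerTokenStream._fill_tokbuf, before the parser ever sees it. A minimal sketch of how that surfaces through the new token stream API follows; it is illustrative only and not part of the patch series, and it assumes the patched cxxheaderparser package is importable:

from cxxheaderparser.lexer import LexerTokenStream

# Tokenize a declaration that uses a user-defined literal suffix.
# Passing None as the filename mirrors the __main__ helper in lexer.py.
stream = LexerTokenStream(None, "units::volt_t v = 1_V;\n")
while True:
    tok = stream.token_eof_ok()
    if tok is None:
        break
    print(tok.type, tok.value)

# The "1" and "_V" pieces are expected to arrive combined as a single token
# of type "UD_INT_CONST_DEC" with value "1_V", which is what allows
# test_user_defined_literal above to see Value(tokens=[Token(value="1_V")]).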