From b47eb7ce10c7f5e67bd9059ac541e837ad9d2049 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sat, 10 Dec 2022 14:08:59 -0500 Subject: [PATCH 1/9] Use lex.TOKEN decorator for lexer tokens instead of docstrings - Allows usage with -OO --- cxxheaderparser/lexer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 9650190..83032a7 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -5,6 +5,7 @@ import typing import sys from ._ply import lex +from ._ply.lex import TOKEN if sys.version_info >= (3, 8): @@ -186,14 +187,14 @@ class Lexer: t_NUMBER = r"[0-9][0-9XxA-Fa-f]*" t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?" + @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*") def t_NAME(self, t: LexToken) -> LexToken: - r"[A-Za-z_~][A-Za-z0-9_]*" if t.value in self.keywords: t.type = t.value return t + @TOKEN(r"\#.*") def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]: - r"\#.*" m = _line_re.match(t.value) if m: filename = m.group(2) @@ -207,8 +208,8 @@ class Lexer: else: return t + @TOKEN(r"\/\/.*\n?") def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken: - r"\/\/.*\n?" if t.value.startswith("///") or t.value.startswith("//!"): self.comments.append(t.value.lstrip("\t ").rstrip("\n")) t.lexer.lineno += t.value.count("\n") @@ -230,8 +231,8 @@ class Lexer: t_STRING_LITERAL = r'"([^"\\]|\\.)*"' # Found at http://ostermiller.org/findcomment.html + @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?") def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken: - r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?" if t.value.startswith("/**") or t.value.startswith("/*!"): # not sure why, but get double new lines v = t.value.replace("\n\n", "\n") @@ -241,8 +242,8 @@ class Lexer: t.lexer.lineno += t.value.count("\n") return t + @TOKEN(r"\n+") def t_NEWLINE(self, t: LexToken) -> LexToken: - r"\n+" t.lexer.lineno += len(t.value) del self.comments[:] return t From 03c24a207440dbcbb7084f3d0c9838990cb65605 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sat, 10 Dec 2022 14:48:20 -0500 Subject: [PATCH 2/9] Better lexer error handling --- cxxheaderparser/errors.py | 3 ++- cxxheaderparser/lexer.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cxxheaderparser/errors.py b/cxxheaderparser/errors.py index e08b510..d457f51 100644 --- a/cxxheaderparser/errors.py +++ b/cxxheaderparser/errors.py @@ -1,6 +1,7 @@ import typing -from .lexer import LexToken +if typing.TYPE_CHECKING: + from .lexer import LexToken class CxxParseError(Exception): diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 83032a7..36c6c34 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -7,6 +7,12 @@ import sys from ._ply import lex from ._ply.lex import TOKEN +from .errors import CxxParseError + + +class LexError(CxxParseError): + pass + if sys.version_info >= (3, 8): from typing import Protocol @@ -249,7 +255,11 @@ class Lexer: return t def t_error(self, t: LexToken) -> None: - print("Lex error: ", t) + self._error(f"Illegal character {t.value!r}", t) + + def _error(self, msg: str, tok: LexToken): + tok.location = self.current_location() + raise LexError(msg, tok) _lexer = None lex: lex.Lexer From aee776072eac004dc8a08d53e95a62090544dcd9 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sat, 10 Dec 2022 15:16:05 -0500 Subject: [PATCH 3/9] Grab string/character lexer constants from pycparser --- LICENSE.txt | 30 ++++- cxxheaderparser/lexer.py | 246 
++++++++++++++++++++++++++++++++++++-- cxxheaderparser/parser.py | 2 +- cxxheaderparser/tokfmt.py | 19 ++- 4 files changed, 282 insertions(+), 15 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 1d8f05b..5d5e3a0 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ cxxheaderparser license: -Copyright (c) 2020 Dustin Spicuzza +Copyright (c) 2020-2022 Dustin Spicuzza All rights reserved. Redistribution and use in source and binary forms, with or without @@ -102,3 +102,31 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- + +pycparser -- A C parser in Python + +Copyright (c) 2008-2022, Eli Bendersky +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 36c6c34..f8d3285 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -61,6 +61,10 @@ PhonyEnding.lexpos = 0 class Lexer: + """ + This lexer is a combination of pieces from the PLY lexers that CppHeaderParser + and pycparser have. 
+ """ keywords = { "__attribute__", @@ -144,15 +148,33 @@ class Lexer: } tokens = [ - "NUMBER", - "FLOAT_NUMBER", + # constants + "FLOAT_CONST", + "HEX_FLOAT_CONST", + "INT_CONST_HEX", + "INT_CONST_BIN", + "INT_CONST_OCT", + "INT_CONST_DEC", + "INT_CONST_CHAR", + "CHAR_CONST", + "WCHAR_CONST", + "U8CHAR_CONST", + "U16CHAR_CONST", + "U32CHAR_CONST", + # String literals + "STRING_LITERAL", + "WSTRING_LITERAL", + "U8STRING_LITERAL", + "U16STRING_LITERAL", + "U32STRING_LITERAL", + # "NAME", + # Comments "COMMENT_SINGLELINE", "COMMENT_MULTILINE", "PRECOMP_MACRO", + # misc "DIVIDE", - "CHAR_LITERAL", - "STRING_LITERAL", "NEWLINE", "ELLIPSIS", "DBL_LBRACKET", @@ -189,9 +211,216 @@ class Lexer: ".", ] + # + # Regexes for use in tokens (taken from pycparser) + # + + hex_prefix = "0[xX]" + hex_digits = "[0-9a-fA-F]+" + bin_prefix = "0[bB]" + bin_digits = "[01]+" + + # integer constants (K&R2: A.2.5.1) + integer_suffix_opt = ( + r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?" + ) + decimal_constant = ( + "(0" + integer_suffix_opt + ")|([1-9][0-9]*" + integer_suffix_opt + ")" + ) + octal_constant = "0[0-7]*" + integer_suffix_opt + hex_constant = hex_prefix + hex_digits + integer_suffix_opt + bin_constant = bin_prefix + bin_digits + integer_suffix_opt + + bad_octal_constant = "0[0-7]*[89]" + + # character constants (K&R2: A.2.5.2) + # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line + # directives with Windows paths as filenames (..\..\dir\file) + # For the same reason, decimal_escape allows all digit sequences. We want to + # parse all correct code, even if it means to sometimes parse incorrect + # code. + # + # The original regexes were taken verbatim from the C syntax definition, + # and were later modified to avoid worst-case exponential running time. + # + # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" + # decimal_escape = r"""(\d+)""" + # hex_escape = r"""(x[0-9a-fA-F]+)""" + # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])""" + # + # The following modifications were made to avoid the ambiguity that allowed backtracking: + # (https://github.com/eliben/pycparser/issues/61) + # + # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape. + # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex + # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal + # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape. + # + # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways. + # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`. 
+ + simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))""" + decimal_escape = r"""(\d+)(?!\d)""" + hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])""" + bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])""" + + escape_sequence = ( + r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))" + ) + + # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed + # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to + + escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])""" + + cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")" + char_const = "'" + cconst_char + "'" + wchar_const = "L" + char_const + u8char_const = "u8" + char_const + u16char_const = "u" + char_const + u32char_const = "U" + char_const + multicharacter_constant = "'" + cconst_char + "{2,4}'" + unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)" + bad_char_const = ( + r"""('""" + + cconst_char + + """[^'\n]+')|('')|('""" + + bad_escape + + r"""[^'\n]*')""" + ) + + # string literals (K&R2: A.2.6) + string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")" + string_literal = '"' + string_char + '*"' + wstring_literal = "L" + string_literal + u8string_literal = "u8" + string_literal + u16string_literal = "u" + string_literal + u32string_literal = "U" + string_literal + bad_string_literal = '"' + string_char + "*" + bad_escape + string_char + '*"' + + # floating constants (K&R2: A.2.5.3) + exponent_part = r"""([eE][-+]?[0-9]+)""" + fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)""" + floating_constant = ( + "((((" + + fractional_constant + + ")" + + exponent_part + + "?)|([0-9]+" + + exponent_part + + "))[FfLl]?)" + ) + binary_exponent_part = r"""([pP][+-]?[0-9]+)""" + hex_fractional_constant = ( + "(((" + hex_digits + r""")?\.""" + hex_digits + ")|(" + hex_digits + r"""\.))""" + ) + hex_floating_constant = ( + "(" + + hex_prefix + + "(" + + hex_digits + + "|" + + hex_fractional_constant + + ")" + + binary_exponent_part + + "[FfLl]?)" + ) + t_ignore = " \t\r?@\f" - t_NUMBER = r"[0-9][0-9XxA-Fa-f]*" - t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?" 
+ + # The following floating and integer constants are defined as + # functions to impose a strict order (otherwise, decimal + # is placed before the others because its regex is longer, + # and this is bad) + # + @TOKEN(floating_constant) + def t_FLOAT_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(hex_floating_constant) + def t_HEX_FLOAT_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(hex_constant) + def t_INT_CONST_HEX(self, t: LexToken) -> LexToken: + return t + + @TOKEN(bin_constant) + def t_INT_CONST_BIN(self, t: LexToken) -> LexToken: + return t + + @TOKEN(bad_octal_constant) + def t_BAD_CONST_OCT(self, t: LexToken) -> None: + msg = "Invalid octal constant" + self._error(msg, t) + + @TOKEN(octal_constant) + def t_INT_CONST_OCT(self, t: LexToken) -> LexToken: + return t + + @TOKEN(decimal_constant) + def t_INT_CONST_DEC(self, t: LexToken) -> LexToken: + return t + + # Must come before bad_char_const, to prevent it from + # catching valid char constants as invalid + # + @TOKEN(multicharacter_constant) + def t_INT_CONST_CHAR(self, t: LexToken) -> LexToken: + return t + + @TOKEN(char_const) + def t_CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(wchar_const) + def t_WCHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u8char_const) + def t_U8CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u16char_const) + def t_U16CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u32char_const) + def t_U32CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(unmatched_quote) + def t_UNMATCHED_QUOTE(self, t: LexToken) -> None: + msg = "Unmatched '" + self._error(msg, t) + + @TOKEN(bad_char_const) + def t_BAD_CHAR_CONST(self, t: LexToken) -> None: + msg = "Invalid char constant %s" % t.value + self._error(msg, t) + + @TOKEN(wstring_literal) + def t_WSTRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u8string_literal) + def t_U8STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u16string_literal) + def t_U16STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u32string_literal) + def t_U32STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + # unmatched string literals are caught by the preprocessor + + @TOKEN(bad_string_literal) + def t_BAD_STRING_LITERAL(self, t): + msg = "String contains invalid escape code" + self._error(msg, t) @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*") def t_NAME(self, t: LexToken) -> LexToken: @@ -222,7 +451,6 @@ class Lexer: return t t_DIVIDE = r"/(?!/)" - t_CHAR_LITERAL = "'.'" t_ELLIPSIS = r"\.\.\." t_DBL_LBRACKET = r"\[\[" t_DBL_RBRACKET = r"\]\]" @@ -232,9 +460,7 @@ class Lexer: t_SHIFT_LEFT = r"<<" # SHIFT_RIGHT introduces ambiguity - # found at http://wordaligned.org/articles/string-literals-and-regular-expressions - # TODO: This does not work with the string "bla \" bla" - t_STRING_LITERAL = r'"([^"\\]|\\.)*"' + t_STRING_LITERAL = string_literal # Found at http://ostermiller.org/findcomment.html @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?") diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index 67d5a37..0be63e4 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -1171,7 +1171,7 @@ class CxxParser: def _parse_bitfield(self) -> int: # is a integral constant expression... 
for now, just do integers - tok = self._next_token_must_be("NUMBER") + tok = self._next_token_must_be("INT_CONST_DEC") return int(tok.value) def _parse_field( diff --git a/cxxheaderparser/tokfmt.py b/cxxheaderparser/tokfmt.py index 3fa1bf2..296c3d2 100644 --- a/cxxheaderparser/tokfmt.py +++ b/cxxheaderparser/tokfmt.py @@ -5,11 +5,24 @@ from .types import Token # key: token type, value: (left spacing, right spacing) _want_spacing = { - "NUMBER": (2, 2), - "FLOAT_NUMBER": (2, 2), + "FLOAT_CONST": (2, 2), + "HEX_FLOAT_CONST": (2, 2), + "INT_CONST_HEX": (2, 2), + "INT_CONST_BIN": (2, 2), + "INT_CONST_OCT": (2, 2), + "INT_CONST_DEC": (2, 2), + "INT_CONST_CHAR": (2, 2), "NAME": (2, 2), - "CHAR_LITERAL": (2, 2), + "CHAR_CONST": (2, 2), + "WCHAR_CONST": (2, 2), + "U8CHAR_CONST": (2, 2), + "U16CHAR_CONST": (2, 2), + "U32CHAR_CONST": (2, 2), "STRING_LITERAL": (2, 2), + "WSTRING_LITERAL": (2, 2), + "U8STRING_LITERAL": (2, 2), + "U16STRING_LITERAL": (2, 2), + "U32STRING_LITERAL": (2, 2), "ELLIPSIS": (2, 2), ">": (0, 2), ")": (0, 1), From b54c80782427113120192861adffafa93df4552e Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Tue, 13 Dec 2022 23:18:14 -0500 Subject: [PATCH 4/9] Remove unused lexer.filenames --- cxxheaderparser/lexer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index f8d3285..165f08e 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -432,11 +432,7 @@ class Lexer: def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]: m = _line_re.match(t.value) if m: - filename = m.group(2) - if filename not in self._filenames_set: - self.filenames.append(filename) - self._filenames_set.add(filename) - self.filename = filename + self.filename = m.group(2) self.line_offset = 1 + self.lex.lineno - int(m.group(1)) return None @@ -508,13 +504,6 @@ class Lexer: self.filename = filename self.line_offset = 0 - self.filenames: typing.List[str] = [] - self._filenames_set: typing.Set[str] = set() - - if filename: - self.filenames.append(filename) - self._filenames_set.add(filename) - # Doxygen comments self.comments = [] From 079d643c67cae0e08ffed5cbca3b2280a2acc11a Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Tue, 13 Dec 2022 23:53:26 -0500 Subject: [PATCH 5/9] Update PLY package to include version --- cxxheaderparser/_ply/__init__.py | 5 +++++ cxxheaderparser/_ply/lex.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cxxheaderparser/_ply/__init__.py b/cxxheaderparser/_ply/__init__.py index e69de29..863ef54 100644 --- a/cxxheaderparser/_ply/__init__.py +++ b/cxxheaderparser/_ply/__init__.py @@ -0,0 +1,5 @@ +# PLY package +# Author: David Beazley (dave@dabeaz.com) +# https://github.com/dabeaz/ply + +__version__ = "2022.10.27" diff --git a/cxxheaderparser/_ply/lex.py b/cxxheaderparser/_ply/lex.py index 37a29b6..766a917 100644 --- a/cxxheaderparser/_ply/lex.py +++ b/cxxheaderparser/_ply/lex.py @@ -2,7 +2,7 @@ # ----------------------------------------------------------------------------- # ply: lex.py # -# Copyright (C) 2001-2020 +# Copyright (C) 2001-2022 # David M. Beazley (Dabeaz LLC) # All rights reserved. 
# From 2ba5c3c829f840842e7d20a0a54031dbba41e857 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 00:50:08 -0500 Subject: [PATCH 6/9] Support extracting doxygen comments when declspec or attributes are present --- cxxheaderparser/parser.py | 11 ++++++++- tests/test_doxygen.py | 47 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index 0be63e4..9a1e098 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -308,25 +308,34 @@ class CxxParser: ";": lambda _1, _2: None, } + _keep_doxygen = {"__declspec", "alignas", "__attribute__", "DBL_LBRACKET"} + tok = None get_token_eof_ok = self.lex.token_eof_ok get_doxygen = self.lex.get_doxygen + doxygen = None + try: while True: tok = get_token_eof_ok() if not tok: break - doxygen = get_doxygen() + if doxygen is None: + doxygen = get_doxygen() fn = _translation_unit_tokens.get(tok.type) if fn: fn(tok, doxygen) + + if tok.type not in _keep_doxygen: + doxygen = None else: # this processes ambiguous declarations self._parse_declarations(tok, doxygen) + doxygen = None except Exception as e: if self.verbose: diff --git a/tests/test_doxygen.py b/tests/test_doxygen.py index f82e615..88824b9 100644 --- a/tests/test_doxygen.py +++ b/tests/test_doxygen.py @@ -329,3 +329,50 @@ def test_doxygen_namespace() -> None: } ) ) + + +def test_doxygen_declspec() -> None: + content = """ + /// declspec comment + __declspec(thread) int i = 1; + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + variables=[ + Variable( + name=PQName(segments=[NameSpecifier(name="i")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + value=Value(tokens=[Token(value="1")]), + doxygen="/// declspec comment", + ) + ] + ) + ) + + +def test_doxygen_attribute() -> None: + content = """ + /// hasattr comment + [[nodiscard]] + int hasattr(); + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + functions=[ + Function( + return_type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + name=PQName(segments=[NameSpecifier(name="hasattr")]), + parameters=[], + doxygen="/// hasattr comment", + ) + ] + ) + ) From 40bf05b3844cfdc9e219bdd3e4e4b2e5b38c466a Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 00:50:54 -0500 Subject: [PATCH 7/9] Add additional doxygen related testcases to make sure things don't accidentally break --- tests/test_doxygen.py | 69 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/tests/test_doxygen.py b/tests/test_doxygen.py index 88824b9..dcc559d 100644 --- a/tests/test_doxygen.py +++ b/tests/test_doxygen.py @@ -209,16 +209,13 @@ def test_doxygen_fn_3slash() -> None: ) -def test_doxygen_fn_cstyle() -> None: +def test_doxygen_fn_cstyle1() -> None: content = """ - // clang-format off - /** * fn comment */ void fn(); - """ data = parse_string(content, cleandoc=True) @@ -238,6 +235,32 @@ def test_doxygen_fn_cstyle() -> None: ) +def test_doxygen_fn_cstyle2() -> None: + content = """ + /*! 
+ * fn comment + */ + void + fn(); + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + functions=[ + Function( + return_type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="void")]) + ), + name=PQName(segments=[NameSpecifier(name="fn")]), + parameters=[], + doxygen="/*!\n* fn comment\n*/", + ) + ] + ) + ) + + def test_doxygen_var_above() -> None: content = """ // clang-format off @@ -292,6 +315,44 @@ def test_doxygen_var_after() -> None: ) +def test_doxygen_multiple_variables() -> None: + content = """ + int x; /// this is x + int y; /// this is y + /// this is also y + int z; /// this is z + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + variables=[ + Variable( + name=PQName(segments=[NameSpecifier(name="x")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + doxygen="/// this is x", + ), + Variable( + name=PQName(segments=[NameSpecifier(name="y")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + doxygen="/// this is y\n/// this is also y", + ), + Variable( + name=PQName(segments=[NameSpecifier(name="z")]), + type=Type( + typename=PQName(segments=[FundamentalSpecifier(name="int")]) + ), + doxygen="/// this is z", + ), + ] + ) + ) + + def test_doxygen_namespace() -> None: content = """ /** From 1eaa85ae8d04fac47496bfe7f4cd6923418fc432 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 02:38:44 -0500 Subject: [PATCH 8/9] Split the lexer into PlyLexer and TokenStream components - There are two types of token streams: file based, and list based - I think this has better component separation - Doxygen parsing is a bit weirder, but I think it's more straightforward to see all the pieces? --- cxxheaderparser/lexer.py | 368 ++++++++++++++++++++++---------------- cxxheaderparser/parser.py | 32 ++-- cxxheaderparser/tokfmt.py | 10 +- tests/test_tokfmt.py | 6 +- 4 files changed, 246 insertions(+), 170 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 165f08e..096882a 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -50,7 +50,7 @@ class LexToken(Protocol): location: Location #: private - lexer: "Lexer" + lexer: lex.Lexer PhonyEnding: LexToken = lex.LexToken() # type: ignore @@ -60,10 +60,13 @@ PhonyEnding.lineno = 0 PhonyEnding.lexpos = 0 -class Lexer: +class PlyLexer: """ This lexer is a combination of pieces from the PLY lexers that CppHeaderParser and pycparser have. + + This tokenizes the input into tokens. The other lexer classes do more complex + things with the tokens. """ keywords = { @@ -439,13 +442,6 @@ class Lexer: else: return t - @TOKEN(r"\/\/.*\n?") - def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken: - if t.value.startswith("///") or t.value.startswith("//!"): - self.comments.append(t.value.lstrip("\t ").rstrip("\n")) - t.lexer.lineno += t.value.count("\n") - return t - t_DIVIDE = r"/(?!/)" t_ELLIPSIS = r"\.\.\." 
t_DBL_LBRACKET = r"\[\[" @@ -458,22 +454,20 @@ class Lexer: t_STRING_LITERAL = string_literal + @TOKEN(r"\/\/.*\n?") + def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken: + t.lexer.lineno += t.value.count("\n") + return t + # Found at http://ostermiller.org/findcomment.html @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?") def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken: - if t.value.startswith("/**") or t.value.startswith("/*!"): - # not sure why, but get double new lines - v = t.value.replace("\n\n", "\n") - # strip prefixing whitespace - v = _multicomment_re.sub("\n*", v) - self.comments = v.splitlines() t.lexer.lineno += t.value.count("\n") return t @TOKEN(r"\n+") def t_NEWLINE(self, t: LexToken) -> LexToken: t.lexer.lineno += len(t.value) - del self.comments[:] return t def t_error(self, t: LexToken) -> None: @@ -485,9 +479,8 @@ class Lexer: _lexer = None lex: lex.Lexer - lineno: int - def __new__(cls, *args, **kwargs) -> "Lexer": + def __new__(cls, *args, **kwargs) -> "PlyLexer": # only build the lexer once inst = super().__new__(cls) if cls._lexer is None: @@ -499,157 +492,75 @@ class Lexer: def __init__(self, filename: typing.Optional[str] = None): self.input: typing.Callable[[str], None] = self.lex.input + self.token: typing.Callable[[], LexToken] = self.lex.token # For tracking current file/line position self.filename = filename self.line_offset = 0 - # Doxygen comments - self.comments = [] + def current_location(self) -> Location: + return Location(self.filename, self.lex.lineno - self.line_offset) - self.lookahead = typing.Deque[LexToken]() - # For 'set_group_of_tokens' support - self._get_token: typing.Callable[[], LexToken] = self.lex.token - self.lookahead_stack = typing.Deque[typing.Deque[LexToken]]() +class TokenStream: + """ + Provides access to a stream of tokens + """ + + tokbuf: typing.Deque[LexToken] + + def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: + """ + Fills tokbuf with tokens from the next line. Return True if at least + one token was added to the buffer + """ + raise NotImplementedError def current_location(self) -> Location: - if self.lookahead: - return self.lookahead[0].location - return Location(self.filename, self.lex.lineno - self.line_offset) + raise NotImplementedError def get_doxygen(self) -> typing.Optional[str]: """ - This should be called after the first element of something has - been consumed. - - It will lookahead for comments that come after the item, if prior - comments don't exist. 
+ This is called at the point that you want doxygen information """ + raise NotImplementedError - # Assumption: This function is either called at the beginning of a - # statement or at the end of a statement - - if self.comments: - comments = self.comments - else: - comments = [] - # only look for comments until a newline (including lookahead) - for tok in self.lookahead: - if tok.type == "NEWLINE": - return None - - while True: - tok = self._get_token() - comments.extend(self.comments) - - if tok is None: - break - - tok.location = Location(self.filename, tok.lineno - self.line_offset) - ttype = tok.type - if ttype == "NEWLINE": - self.lookahead.append(tok) - break - - if ttype not in self._discard_types: - self.lookahead.append(tok) - - if ttype == "NAME": - break - - del self.comments[:] - - comment_str = "\n".join(comments) - del self.comments[:] - if comment_str: - return comment_str - - return None + def get_doxygen_after(self) -> typing.Optional[str]: + """ + This is called to retrieve doxygen information after a statement + """ + raise NotImplementedError _discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"} - def _token_limit_exceeded(self) -> typing.NoReturn: - from .errors import CxxParseError - - raise CxxParseError("no more tokens left in this group") - - @contextlib.contextmanager - def set_group_of_tokens( - self, toks: typing.List[LexToken] - ) -> typing.Generator[typing.Deque[LexToken], None, None]: - # intended for use when you have a set of tokens that you know - # must be consumed, such as a paren grouping or some type of - # lookahead case - - stack = self.lookahead_stack - restore_fn = False - - if not stack: - restore_fn = True - self._get_token = self._token_limit_exceeded - - this_buf = typing.Deque[LexToken](toks) - prev_buf = self.lookahead - stack.append(prev_buf) - self.lookahead = this_buf - - try: - yield this_buf - finally: - buf = stack.pop() - if prev_buf is not buf: - raise ValueError("internal error") - - self.lookahead = prev_buf - - if restore_fn: - self._get_token = self.lex.token - def token(self) -> LexToken: - tok = None - while self.lookahead: - tok = self.lookahead.popleft() - if tok.type not in self._discard_types: - return tok - + tokbuf = self.tokbuf while True: - tok = self._get_token() - if tok is None: + while tokbuf: + tok = tokbuf.popleft() + if tok.type not in self._discard_types: + return tok + + if not self._fill_tokbuf(tokbuf): raise EOFError("unexpected end of file") - if tok.type not in self._discard_types: - tok.location = Location(self.filename, tok.lineno - self.line_offset) - break - - return tok - def token_eof_ok(self) -> typing.Optional[LexToken]: - tok = None - while self.lookahead: - tok = self.lookahead.popleft() - if tok.type not in self._discard_types: - return tok - + tokbuf = self.tokbuf while True: - tok = self._get_token() - if tok is None: - break + while tokbuf: + tok = tokbuf.popleft() + if tok.type not in self._discard_types: + return tok - if tok.type not in self._discard_types: - tok.location = Location(self.filename, tok.lineno - self.line_offset) - break - - return tok + if not self._fill_tokbuf(tokbuf): + return None def token_if(self, *types: str) -> typing.Optional[LexToken]: tok = self.token_eof_ok() if tok is None: return None if tok.type not in types: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -658,9 +569,7 @@ class Lexer: if tok is None: return None if 
tok.type not in types: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -669,9 +578,7 @@ class Lexer: if tok is None: return None if tok.value not in vals: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -680,9 +587,7 @@ class Lexer: if tok is None: return None if tok.type in types: - # put it back on the left in case it was retrieved - # from the lookahead buffer - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return None return tok @@ -690,18 +595,177 @@ class Lexer: tok = self.token_eof_ok() if not tok: return False - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) return tok.type in types def return_token(self, tok: LexToken) -> None: - self.lookahead.appendleft(tok) + self.tokbuf.appendleft(tok) def return_tokens(self, toks: typing.Sequence[LexToken]) -> None: - self.lookahead.extendleft(reversed(toks)) + self.tokbuf.extendleft(reversed(toks)) + + +class LexerTokenStream(TokenStream): + """ + Provides tokens from using PlyLexer on the given input text + """ + + def __init__(self, filename: typing.Optional[str], content: str) -> None: + self._lex = PlyLexer(filename) + self._lex.input(content) + self.tokbuf = typing.Deque[LexToken]() + + def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: + get_token = self._lex.token + tokbuf = self.tokbuf + + tok = get_token() + if tok is None: + return False + + while True: + tok.location = self._lex.current_location() + tokbuf.append(tok) + + if tok.type == "NEWLINE": + break + + tok = get_token() + if tok is None: + break + + return True + + def current_location(self) -> Location: + if self.tokbuf: + return self.tokbuf[0].location + return self._lex.current_location() + + def get_doxygen(self) -> typing.Optional[str]: + + tokbuf = self.tokbuf + + # fill the token buffer if it's empty (which indicates a newline) + if not tokbuf and not self._fill_tokbuf(tokbuf): + return None + + comments: typing.List[LexToken] = [] + + # retrieve any comments in the stream right before + # the first non-discard element + keep_going = True + while True: + while tokbuf: + tok = tokbuf.popleft() + if tok.type == "NEWLINE": + comments.clear() + elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): + comments.append(tok) + else: + tokbuf.appendleft(tok) + keep_going = False + break + + if not keep_going: + break + + if not self._fill_tokbuf(tokbuf): + break + + if comments: + return self._extract_comments(comments) + + return None + + def get_doxygen_after(self) -> typing.Optional[str]: + tokbuf = self.tokbuf + + # if there's a newline directly after a statement, we're done + if not tokbuf: + return None + + # retrieve comments after non-discard elements + comments: typing.List[LexToken] = [] + new_tokbuf = typing.Deque[LexToken]() + + # This is different: we only extract tokens here + while tokbuf: + tok = tokbuf.popleft() + if tok.type == "NEWLINE": + break + elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): + comments.append(tok) + else: + new_tokbuf.append(tok) + if comments: + break + + new_tokbuf.extend(tokbuf) + self.tokbuf = new_tokbuf + + if comments: + return self._extract_comments(comments) + + return None + + def _extract_comments(self, comments: typing.List[LexToken]): + # Now we have comments, need to extract the text from them + comment_lines: 
typing.List[str] = [] + for c in comments: + text = c.value + if c.type == "COMMENT_SINGLELINE": + if text.startswith("///") or text.startswith("//!"): + comment_lines.append(text.rstrip("\n")) + else: + if text.startswith("/**") or text.startswith("/*!"): + # not sure why, but get double new lines + text = text.replace("\n\n", "\n") + # strip prefixing whitespace + text = _multicomment_re.sub("\n*", text) + comment_lines = text.splitlines() + + comment_str = "\n".join(comment_lines) + if comment_str: + return comment_str + + return None + + +class BoundedTokenStream(TokenStream): + """ + Provides tokens from a fixed list of tokens. + + Intended for use when you have a group of tokens that you know + must be consumed, such as a paren grouping or some type of + lookahead case + """ + + def __init__(self, toks: typing.List[LexToken]) -> None: + self.tokbuf = typing.Deque[LexToken](toks) + + def has_tokens(self) -> bool: + return len(self.tokbuf) > 0 + + def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: + from .errors import CxxParseError + + raise CxxParseError("no more tokens left in this group") + + def current_location(self) -> Location: + if self.tokbuf: + return self.tokbuf[0].location + raise ValueError("internal error") + + def get_doxygen(self) -> typing.Optional[str]: + # comment tokens aren't going to be in this stream + return None + + def get_doxygen_after(self) -> typing.Optional[str]: + return None if __name__ == "__main__": # pragma: no cover try: - lex.runmain(lexer=Lexer(None)) + lex.runmain(lexer=PlyLexer(None)) except EOFError: pass diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index 9a1e098..6c6e764 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -4,8 +4,9 @@ import inspect import re import typing +from . 
import lexer from .errors import CxxParseError -from .lexer import Lexer, LexToken, Location, PhonyEnding +from .lexer import LexToken, Location, PhonyEnding from .options import ParserOptions from .parserstate import ( ClassBlockState, @@ -80,8 +81,7 @@ class CxxParser: self.visitor = visitor self.filename = filename - self.lex = Lexer(filename) - self.lex.input(content) + self.lex: lexer.TokenStream = lexer.LexerTokenStream(filename, content) global_ns = NamespaceDecl([], False) self.current_namespace = global_ns @@ -319,13 +319,13 @@ class CxxParser: try: while True: + if doxygen is None: + doxygen = get_doxygen() + tok = get_token_eof_ok() if not tok: break - if doxygen is None: - doxygen = get_doxygen() - fn = _translation_unit_tokens.get(tok.type) if fn: fn(tok, doxygen) @@ -619,7 +619,12 @@ class CxxParser: # append a token to make other parsing components happy raw_toks.append(PhonyEnding) - with self.lex.set_group_of_tokens(raw_toks) as remainder: + old_lex = self.lex + try: + # set up a temporary token stream with the tokens we need to parse + tmp_lex = lexer.BoundedTokenStream(raw_toks) + self.lex = tmp_lex + try: parsed_type, mods = self._parse_type(None) if parsed_type is None: @@ -631,9 +636,12 @@ class CxxParser: except CxxParseError: dtype = None else: - if remainder: + if tmp_lex.has_tokens(): dtype = None + finally: + self.lex = old_lex + if self.lex.token_if("ELLIPSIS"): param_pack = True @@ -948,12 +956,16 @@ class CxxParser: values: typing.List[Enumerator] = [] while True: + doxygen = self.lex.get_doxygen() + name_tok = self._next_token_must_be("}", "NAME") if name_tok.value == "}": break + if doxygen is None: + doxygen = self.lex.get_doxygen_after() + name = name_tok.value - doxygen = self.lex.get_doxygen() value = None tok = self._next_token_must_be("}", ",", "=", "DBL_LBRACKET") @@ -1253,7 +1265,7 @@ class CxxParser: if doxygen is None: # try checking after the var - doxygen = self.lex.get_doxygen() + doxygen = self.lex.get_doxygen_after() if is_typedef: if not name: diff --git a/cxxheaderparser/tokfmt.py b/cxxheaderparser/tokfmt.py index 296c3d2..f2bb67c 100644 --- a/cxxheaderparser/tokfmt.py +++ b/cxxheaderparser/tokfmt.py @@ -1,6 +1,6 @@ import typing -from .lexer import LexToken, Lexer +from .lexer import LexToken, PlyLexer, LexerTokenStream from .types import Token # key: token type, value: (left spacing, right spacing) @@ -32,7 +32,7 @@ _want_spacing = { "&": (0, 2), } -_want_spacing.update(dict.fromkeys(Lexer.keywords, (2, 2))) +_want_spacing.update(dict.fromkeys(PlyLexer.keywords, (2, 2))) def tokfmt(toks: typing.List[Token]) -> str: @@ -67,9 +67,9 @@ if __name__ == "__main__": # pragma: no cover parser.add_argument("header") args = parser.parse_args() - lexer = Lexer(args.header) - with open(lexer.filename) as fp: - lexer.input(fp.read()) # type: ignore + filename: str = args.header + with open(filename) as fp: + lexer = LexerTokenStream(filename, fp.read()) toks: typing.List[Token] = [] while True: diff --git a/tests/test_tokfmt.py b/tests/test_tokfmt.py index ba245c3..758b9f6 100644 --- a/tests/test_tokfmt.py +++ b/tests/test_tokfmt.py @@ -1,6 +1,6 @@ import pytest -from cxxheaderparser.lexer import Lexer +from cxxheaderparser.lexer import PlyLexer from cxxheaderparser.tokfmt import tokfmt from cxxheaderparser.types import Token @@ -40,11 +40,11 @@ def test_tokfmt(instr: str) -> None: Each input string is exactly what the output of tokfmt should be """ toks = [] - lexer = Lexer("") + lexer = PlyLexer("") lexer.input(instr) while True: - tok = 
lexer.token_eof_ok() + tok = lexer.token() if not tok: break From e5295070a036e628f42a25c6d453f21733bf3736 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Thu, 15 Dec 2022 02:55:07 -0500 Subject: [PATCH 9/9] Add support for parsing user defined literals --- cxxheaderparser/lexer.py | 51 ++++++++++++++++++++++++++++++++++++++-- tests/test_misc.py | 31 ++++++++++++++++++++++++ tests/test_tokfmt.py | 5 ++-- 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 096882a..075e2cf 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -179,6 +179,7 @@ class PlyLexer: # misc "DIVIDE", "NEWLINE", + "WHITESPACE", "ELLIPSIS", "DBL_LBRACKET", "DBL_RBRACKET", @@ -329,7 +330,8 @@ class PlyLexer: + "[FfLl]?)" ) - t_ignore = " \t\r?@\f" + t_WHITESPACE = "[ \t]+" + t_ignore = "\r" # The following floating and integer constants are defined as # functions to impose a strict order (otherwise, decimal @@ -531,7 +533,12 @@ class TokenStream: """ raise NotImplementedError - _discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"} + _discard_types = { + "NEWLINE", + "COMMENT_SINGLELINE", + "COMMENT_MULTILINE", + "WHITESPACE", + } def token(self) -> LexToken: tokbuf = self.tokbuf @@ -610,6 +617,27 @@ class LexerTokenStream(TokenStream): Provides tokens from using PlyLexer on the given input text """ + _user_defined_literal_start = { + "FLOAT_CONST", + "HEX_FLOAT_CONST", + "INT_CONST_HEX", + "INT_CONST_BIN", + "INT_CONST_OCT", + "INT_CONST_DEC", + "INT_CONST_CHAR", + "CHAR_CONST", + "WCHAR_CONST", + "U8CHAR_CONST", + "U16CHAR_CONST", + "U32CHAR_CONST", + # String literals + "STRING_LITERAL", + "WSTRING_LITERAL", + "U8STRING_LITERAL", + "U16STRING_LITERAL", + "U32STRING_LITERAL", + } + def __init__(self, filename: typing.Optional[str], content: str) -> None: self._lex = PlyLexer(filename) self._lex.input(content) @@ -623,6 +651,8 @@ class LexerTokenStream(TokenStream): if tok is None: return False + udl_start = self._user_defined_literal_start + while True: tok.location = self._lex.current_location() tokbuf.append(tok) @@ -630,6 +660,19 @@ class LexerTokenStream(TokenStream): if tok.type == "NEWLINE": break + # detect/combine user defined literals + if tok.type in udl_start: + tok2 = get_token() + if tok2 is None: + break + + if tok2.type != "NAME" or tok2.value[0] != "_": + tok = tok2 + continue + + tok.value = tok.value + tok2.value + tok.type = f"UD_{tok.type}" + tok = get_token() if tok is None: break @@ -659,6 +702,8 @@ class LexerTokenStream(TokenStream): tok = tokbuf.popleft() if tok.type == "NEWLINE": comments.clear() + elif tok.type == "WHITESPACE": + pass elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): comments.append(tok) else: @@ -693,6 +738,8 @@ class LexerTokenStream(TokenStream): tok = tokbuf.popleft() if tok.type == "NEWLINE": break + elif tok.type == "WHITESPACE": + new_tokbuf.append(tok) elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"): comments.append(tok) else: diff --git a/tests/test_misc.py b/tests/test_misc.py index b90fd96..81d4a0e 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -236,3 +236,34 @@ def test_final() -> None: ], ) ) + + +# +# User defined literals +# + + +def test_user_defined_literal() -> None: + content = """ + units::volt_t v = 1_V; + """ + data = parse_string(content, cleandoc=True) + + assert data == ParsedData( + namespace=NamespaceScope( + variables=[ + Variable( + name=PQName(segments=[NameSpecifier(name="v")]), + type=Type( + 
typename=PQName( + segments=[ + NameSpecifier(name="units"), + NameSpecifier(name="volt_t"), + ] + ) + ), + value=Value(tokens=[Token(value="1_V")]), + ) + ] + ) + ) diff --git a/tests/test_tokfmt.py b/tests/test_tokfmt.py index 758b9f6..cc0b379 100644 --- a/tests/test_tokfmt.py +++ b/tests/test_tokfmt.py @@ -1,6 +1,6 @@ import pytest -from cxxheaderparser.lexer import PlyLexer +from cxxheaderparser.lexer import PlyLexer, LexerTokenStream from cxxheaderparser.tokfmt import tokfmt from cxxheaderparser.types import Token @@ -48,6 +48,7 @@ def test_tokfmt(instr: str) -> None: if not tok: break - toks.append(Token(tok.value, tok.type)) + if tok.type not in LexerTokenStream._discard_types: + toks.append(Token(tok.value, tok.type)) assert tokfmt(toks) == instr
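
Taken together, patches 8 and 9 mean that a numeric or string constant followed immediately by a NAME beginning with an underscore is folded into a single UD_* token inside LexerTokenStream._fill_tokbuf, before the parser ever sees it. A minimal sketch of how that surfaces through the new token stream API follows; it is illustrative only and not part of the patch series, and it assumes the patched cxxheaderparser package is importable:

from cxxheaderparser.lexer import LexerTokenStream

# Tokenize a declaration that uses a user-defined literal suffix.
# Passing None as the filename mirrors the __main__ helper in lexer.py.
stream = LexerTokenStream(None, "units::volt_t v = 1_V;\n")
while True:
    tok = stream.token_eof_ok()
    if tok is None:
        break
    print(tok.type, tok.value)

# The "1" and "_V" pieces are expected to arrive combined as a single token
# of type "UD_INT_CONST_DEC" with value "1_V", which is what allows
# test_user_defined_literal above to see Value(tokens=[Token(value="1_V")]).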