From aee776072eac004dc8a08d53e95a62090544dcd9 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sat, 10 Dec 2022 15:16:05 -0500 Subject: [PATCH] Grab string/character lexer constants from pycparser --- LICENSE.txt | 30 ++++- cxxheaderparser/lexer.py | 246 ++++++++++++++++++++++++++++++++++++-- cxxheaderparser/parser.py | 2 +- cxxheaderparser/tokfmt.py | 19 ++- 4 files changed, 282 insertions(+), 15 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 1d8f05b..5d5e3a0 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ cxxheaderparser license: -Copyright (c) 2020 Dustin Spicuzza +Copyright (c) 2020-2022 Dustin Spicuzza All rights reserved. Redistribution and use in source and binary forms, with or without @@ -102,3 +102,31 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- + +pycparser -- A C parser in Python + +Copyright (c) 2008-2022, Eli Bendersky +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 36c6c34..f8d3285 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -61,6 +61,10 @@ PhonyEnding.lexpos = 0 class Lexer: + """ + This lexer is a combination of pieces from the PLY lexers that CppHeaderParser + and pycparser have. 
+ """ keywords = { "__attribute__", @@ -144,15 +148,33 @@ class Lexer: } tokens = [ - "NUMBER", - "FLOAT_NUMBER", + # constants + "FLOAT_CONST", + "HEX_FLOAT_CONST", + "INT_CONST_HEX", + "INT_CONST_BIN", + "INT_CONST_OCT", + "INT_CONST_DEC", + "INT_CONST_CHAR", + "CHAR_CONST", + "WCHAR_CONST", + "U8CHAR_CONST", + "U16CHAR_CONST", + "U32CHAR_CONST", + # String literals + "STRING_LITERAL", + "WSTRING_LITERAL", + "U8STRING_LITERAL", + "U16STRING_LITERAL", + "U32STRING_LITERAL", + # "NAME", + # Comments "COMMENT_SINGLELINE", "COMMENT_MULTILINE", "PRECOMP_MACRO", + # misc "DIVIDE", - "CHAR_LITERAL", - "STRING_LITERAL", "NEWLINE", "ELLIPSIS", "DBL_LBRACKET", @@ -189,9 +211,216 @@ class Lexer: ".", ] + # + # Regexes for use in tokens (taken from pycparser) + # + + hex_prefix = "0[xX]" + hex_digits = "[0-9a-fA-F]+" + bin_prefix = "0[bB]" + bin_digits = "[01]+" + + # integer constants (K&R2: A.2.5.1) + integer_suffix_opt = ( + r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?" + ) + decimal_constant = ( + "(0" + integer_suffix_opt + ")|([1-9][0-9]*" + integer_suffix_opt + ")" + ) + octal_constant = "0[0-7]*" + integer_suffix_opt + hex_constant = hex_prefix + hex_digits + integer_suffix_opt + bin_constant = bin_prefix + bin_digits + integer_suffix_opt + + bad_octal_constant = "0[0-7]*[89]" + + # character constants (K&R2: A.2.5.2) + # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line + # directives with Windows paths as filenames (..\..\dir\file) + # For the same reason, decimal_escape allows all digit sequences. We want to + # parse all correct code, even if it means to sometimes parse incorrect + # code. + # + # The original regexes were taken verbatim from the C syntax definition, + # and were later modified to avoid worst-case exponential running time. + # + # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" + # decimal_escape = r"""(\d+)""" + # hex_escape = r"""(x[0-9a-fA-F]+)""" + # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])""" + # + # The following modifications were made to avoid the ambiguity that allowed backtracking: + # (https://github.com/eliben/pycparser/issues/61) + # + # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape. + # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex + # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal + # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape. + # + # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways. + # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`. 
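To make the effect of those lookaheads concrete, here is a minimal standalone sketch (not part of the patch) using only Python's re module; the two patterns are cut-down copies of the decimal_escape and hex_escape attributes defined just below. With the trailing (?!\d) and (?![0-9a-fA-F]) assertions, an escape such as \123 can only be consumed as a single unit, so the engine never has shorter alternative parses to fall back on.

import re

# Cut-down copies of the modified escapes; the trailing lookaheads force
# each escape to swallow the whole digit/hex run or fail outright.
decimal_escape = r"(\d+)(?!\d)"
hex_escape = r"(x[0-9a-fA-F]+)(?![0-9a-fA-F])"
escape = r"\\(" + decimal_escape + "|" + hex_escape + r")"

print(re.fullmatch(escape, r"\123").group())  # \123, never just \1 or \12
print(re.fullmatch(escape, r"\x41").group())  # \x41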
+ + simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))""" + decimal_escape = r"""(\d+)(?!\d)""" + hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])""" + bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])""" + + escape_sequence = ( + r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))" + ) + + # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed + # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to + + escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])""" + + cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")" + char_const = "'" + cconst_char + "'" + wchar_const = "L" + char_const + u8char_const = "u8" + char_const + u16char_const = "u" + char_const + u32char_const = "U" + char_const + multicharacter_constant = "'" + cconst_char + "{2,4}'" + unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)" + bad_char_const = ( + r"""('""" + + cconst_char + + """[^'\n]+')|('')|('""" + + bad_escape + + r"""[^'\n]*')""" + ) + + # string literals (K&R2: A.2.6) + string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")" + string_literal = '"' + string_char + '*"' + wstring_literal = "L" + string_literal + u8string_literal = "u8" + string_literal + u16string_literal = "u" + string_literal + u32string_literal = "U" + string_literal + bad_string_literal = '"' + string_char + "*" + bad_escape + string_char + '*"' + + # floating constants (K&R2: A.2.5.3) + exponent_part = r"""([eE][-+]?[0-9]+)""" + fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)""" + floating_constant = ( + "((((" + + fractional_constant + + ")" + + exponent_part + + "?)|([0-9]+" + + exponent_part + + "))[FfLl]?)" + ) + binary_exponent_part = r"""([pP][+-]?[0-9]+)""" + hex_fractional_constant = ( + "(((" + hex_digits + r""")?\.""" + hex_digits + ")|(" + hex_digits + r"""\.))""" + ) + hex_floating_constant = ( + "(" + + hex_prefix + + "(" + + hex_digits + + "|" + + hex_fractional_constant + + ")" + + binary_exponent_part + + "[FfLl]?)" + ) + t_ignore = " \t\r?@\f" - t_NUMBER = r"[0-9][0-9XxA-Fa-f]*" - t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?" 
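As a quick sanity check on the composed patterns, the sketch below (standalone, not part of the patch) reassembles string_literal verbatim from the attributes above and matches a literal containing an escaped quote, the exact case flagged in the TODO that this patch removes alongside the old t_STRING_LITERAL.

import re

# Reassembled verbatim from the class attributes above.
escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")"
string_literal = '"' + string_char + '*"'

# An escaped quote no longer terminates the literal early.
print(re.fullmatch(string_literal, r'"bla \" bla"') is not None)   # True
print(re.fullmatch(string_literal, r'"unterminated') is not None)  # False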
+ + # The following floating and integer constants are defined as + # functions to impose a strict order (otherwise, decimal + # is placed before the others because its regex is longer, + # and this is bad) + # + @TOKEN(floating_constant) + def t_FLOAT_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(hex_floating_constant) + def t_HEX_FLOAT_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(hex_constant) + def t_INT_CONST_HEX(self, t: LexToken) -> LexToken: + return t + + @TOKEN(bin_constant) + def t_INT_CONST_BIN(self, t: LexToken) -> LexToken: + return t + + @TOKEN(bad_octal_constant) + def t_BAD_CONST_OCT(self, t: LexToken) -> None: + msg = "Invalid octal constant" + self._error(msg, t) + + @TOKEN(octal_constant) + def t_INT_CONST_OCT(self, t: LexToken) -> LexToken: + return t + + @TOKEN(decimal_constant) + def t_INT_CONST_DEC(self, t: LexToken) -> LexToken: + return t + + # Must come before bad_char_const, to prevent it from + # catching valid char constants as invalid + # + @TOKEN(multicharacter_constant) + def t_INT_CONST_CHAR(self, t: LexToken) -> LexToken: + return t + + @TOKEN(char_const) + def t_CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(wchar_const) + def t_WCHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u8char_const) + def t_U8CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u16char_const) + def t_U16CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u32char_const) + def t_U32CHAR_CONST(self, t: LexToken) -> LexToken: + return t + + @TOKEN(unmatched_quote) + def t_UNMATCHED_QUOTE(self, t: LexToken) -> None: + msg = "Unmatched '" + self._error(msg, t) + + @TOKEN(bad_char_const) + def t_BAD_CHAR_CONST(self, t: LexToken) -> None: + msg = "Invalid char constant %s" % t.value + self._error(msg, t) + + @TOKEN(wstring_literal) + def t_WSTRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u8string_literal) + def t_U8STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u16string_literal) + def t_U16STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + @TOKEN(u32string_literal) + def t_U32STRING_LITERAL(self, t: LexToken) -> LexToken: + return t + + # unmatched string literals are caught by the preprocessor + + @TOKEN(bad_string_literal) + def t_BAD_STRING_LITERAL(self, t): + msg = "String contains invalid escape code" + self._error(msg, t) @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*") def t_NAME(self, t: LexToken) -> LexToken: @@ -222,7 +451,6 @@ class Lexer: return t t_DIVIDE = r"/(?!/)" - t_CHAR_LITERAL = "'.'" t_ELLIPSIS = r"\.\.\." t_DBL_LBRACKET = r"\[\[" t_DBL_RBRACKET = r"\]\]" @@ -232,9 +460,7 @@ class Lexer: t_SHIFT_LEFT = r"<<" # SHIFT_RIGHT introduces ambiguity - # found at http://wordaligned.org/articles/string-literals-and-regular-expressions - # TODO: This does not work with the string "bla \" bla" - t_STRING_LITERAL = r'"([^"\\]|\\.)*"' + t_STRING_LITERAL = string_literal # Found at http://ostermiller.org/findcomment.html @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?") diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index 67d5a37..0be63e4 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -1171,7 +1171,7 @@ class CxxParser: def _parse_bitfield(self) -> int: # is a integral constant expression... 
for now, just do integers - tok = self._next_token_must_be("NUMBER") + tok = self._next_token_must_be("INT_CONST_DEC") return int(tok.value) def _parse_field( diff --git a/cxxheaderparser/tokfmt.py b/cxxheaderparser/tokfmt.py index 3fa1bf2..296c3d2 100644 --- a/cxxheaderparser/tokfmt.py +++ b/cxxheaderparser/tokfmt.py @@ -5,11 +5,24 @@ from .types import Token # key: token type, value: (left spacing, right spacing) _want_spacing = { - "NUMBER": (2, 2), - "FLOAT_NUMBER": (2, 2), + "FLOAT_CONST": (2, 2), + "HEX_FLOAT_CONST": (2, 2), + "INT_CONST_HEX": (2, 2), + "INT_CONST_BIN": (2, 2), + "INT_CONST_OCT": (2, 2), + "INT_CONST_DEC": (2, 2), + "INT_CONST_CHAR": (2, 2), "NAME": (2, 2), - "CHAR_LITERAL": (2, 2), + "CHAR_CONST": (2, 2), + "WCHAR_CONST": (2, 2), + "U8CHAR_CONST": (2, 2), + "U16CHAR_CONST": (2, 2), + "U32CHAR_CONST": (2, 2), "STRING_LITERAL": (2, 2), + "WSTRING_LITERAL": (2, 2), + "U8STRING_LITERAL": (2, 2), + "U16STRING_LITERAL": (2, 2), + "U32STRING_LITERAL": (2, 2), "ELLIPSIS": (2, 2), ">": (0, 2), ")": (0, 1),
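The ordering note above ("defined as functions to impose a strict order") reflects how PLY assembles its master regex: string-defined rules are sorted by decreasing pattern length, while function-defined rules are tried in the order they appear in the source. Below is a minimal standalone sketch of that behaviour; it assumes ply is installed and reuses two of the token names purely for illustration, nothing here comes from the patch itself.

import ply.lex as lex

class MiniLexer:
    # Function rules are matched in definition order, so FLOAT_CONST gets
    # first crack at "1.5"; with the order swapped, "1.5" would lex as an
    # INT_CONST_DEC followed by an error on the ".".
    tokens = ["FLOAT_CONST", "INT_CONST_DEC"]
    t_ignore = " \t"

    def t_FLOAT_CONST(self, t):
        r"[0-9]*\.[0-9]+"
        return t

    def t_INT_CONST_DEC(self, t):
        r"[0-9]+"
        return t

    def t_error(self, t):
        t.lexer.skip(1)

lexer = lex.lex(object=MiniLexer())
lexer.input("1.5 42")
print([(tok.type, tok.value) for tok in lexer])
# [('FLOAT_CONST', '1.5'), ('INT_CONST_DEC', '42')]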