Merge pull request #42 from robotpy/lexer-updates
Significant lexer overhaul
Commit 296272fd39

LICENSE.txt
@@ -1,6 +1,6 @@
 cxxheaderparser license:
 
-Copyright (c) 2020 Dustin Spicuzza <dustin@virtualroadside.com>
+Copyright (c) 2020-2022 Dustin Spicuzza <dustin@virtualroadside.com>
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -102,3 +102,31 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 -----------------------------------------------------------------------------
+
+pycparser -- A C parser in Python
+
+Copyright (c) 2008-2022, Eli Bendersky
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,5 @@
+# PLY package
+# Author: David Beazley (dave@dabeaz.com)
+# https://github.com/dabeaz/ply
+
+__version__ = "2022.10.27"
@@ -2,7 +2,7 @@
 # -----------------------------------------------------------------------------
 # ply: lex.py
 #
-# Copyright (C) 2001-2020
+# Copyright (C) 2001-2022
 # David M. Beazley (Dabeaz LLC)
 # All rights reserved.
 #
@@ -1,5 +1,6 @@
 import typing
 
-from .lexer import LexToken
+if typing.TYPE_CHECKING:
+    from .lexer import LexToken
 
 
@@ -5,6 +5,13 @@ import typing
 import sys
 
 from ._ply import lex
+from ._ply.lex import TOKEN
+
+from .errors import CxxParseError
+
+
+class LexError(CxxParseError):
+    pass
 
 
 if sys.version_info >= (3, 8):
@@ -43,7 +50,7 @@ class LexToken(Protocol):
     location: Location
 
     #: private
-    lexer: "Lexer"
+    lexer: lex.Lexer
 
 
 PhonyEnding: LexToken = lex.LexToken()  # type: ignore
@@ -53,7 +60,14 @@ PhonyEnding.lineno = 0
 PhonyEnding.lexpos = 0
 
 
-class Lexer:
+class PlyLexer:
+    """
+    This lexer is a combination of pieces from the PLY lexers that CppHeaderParser
+    and pycparser have.
+
+    This tokenizes the input into tokens. The other lexer classes do more complex
+    things with the tokens.
+    """
 
     keywords = {
         "__attribute__",
@@ -137,16 +151,35 @@ class Lexer:
     }
 
     tokens = [
-        "NUMBER",
-        "FLOAT_NUMBER",
+        # constants
+        "FLOAT_CONST",
+        "HEX_FLOAT_CONST",
+        "INT_CONST_HEX",
+        "INT_CONST_BIN",
+        "INT_CONST_OCT",
+        "INT_CONST_DEC",
+        "INT_CONST_CHAR",
+        "CHAR_CONST",
+        "WCHAR_CONST",
+        "U8CHAR_CONST",
+        "U16CHAR_CONST",
+        "U32CHAR_CONST",
+        # String literals
+        "STRING_LITERAL",
+        "WSTRING_LITERAL",
+        "U8STRING_LITERAL",
+        "U16STRING_LITERAL",
+        "U32STRING_LITERAL",
+        #
         "NAME",
+        # Comments
         "COMMENT_SINGLELINE",
         "COMMENT_MULTILINE",
         "PRECOMP_MACRO",
+        # misc
         "DIVIDE",
-        "CHAR_LITERAL",
-        "STRING_LITERAL",
         "NEWLINE",
+        "WHITESPACE",
         "ELLIPSIS",
         "DBL_LBRACKET",
         "DBL_RBRACKET",
@@ -182,40 +215,236 @@ class Lexer:
         ".",
     ]
 
-    t_ignore = " \t\r?@\f"
-    t_NUMBER = r"[0-9][0-9XxA-Fa-f]*"
-    t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?"
+    #
+    # Regexes for use in tokens (taken from pycparser)
+    #
 
+    hex_prefix = "0[xX]"
+    hex_digits = "[0-9a-fA-F]+"
+    bin_prefix = "0[bB]"
+    bin_digits = "[01]+"
+
+    # integer constants (K&R2: A.2.5.1)
+    integer_suffix_opt = (
+        r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
+    )
+    decimal_constant = (
+        "(0" + integer_suffix_opt + ")|([1-9][0-9]*" + integer_suffix_opt + ")"
+    )
+    octal_constant = "0[0-7]*" + integer_suffix_opt
+    hex_constant = hex_prefix + hex_digits + integer_suffix_opt
+    bin_constant = bin_prefix + bin_digits + integer_suffix_opt
+
+    bad_octal_constant = "0[0-7]*[89]"
+
+    # character constants (K&R2: A.2.5.2)
+    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
+    # directives with Windows paths as filenames (..\..\dir\file)
+    # For the same reason, decimal_escape allows all digit sequences. We want to
+    # parse all correct code, even if it means to sometimes parse incorrect
+    # code.
+    #
+    # The original regexes were taken verbatim from the C syntax definition,
+    # and were later modified to avoid worst-case exponential running time.
+    #
+    # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
+    # decimal_escape = r"""(\d+)"""
+    # hex_escape = r"""(x[0-9a-fA-F]+)"""
+    # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
+    #
+    # The following modifications were made to avoid the ambiguity that allowed backtracking:
+    # (https://github.com/eliben/pycparser/issues/61)
+    #
+    # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
+    # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex
+    # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal
+    # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
+    #
+    # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
+    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
+
+    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
+    decimal_escape = r"""(\d+)(?!\d)"""
+    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
+    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
+
+    escape_sequence = (
+        r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"
+    )
+
+    # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
+    # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to
+
+    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
+
+    cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")"
+    char_const = "'" + cconst_char + "'"
+    wchar_const = "L" + char_const
+    u8char_const = "u8" + char_const
+    u16char_const = "u" + char_const
+    u32char_const = "U" + char_const
+    multicharacter_constant = "'" + cconst_char + "{2,4}'"
+    unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)"
+    bad_char_const = (
+        r"""('"""
+        + cconst_char
+        + """[^'\n]+')|('')|('"""
+        + bad_escape
+        + r"""[^'\n]*')"""
+    )
+
+    # string literals (K&R2: A.2.6)
+    string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")"
+    string_literal = '"' + string_char + '*"'
+    wstring_literal = "L" + string_literal
+    u8string_literal = "u8" + string_literal
+    u16string_literal = "u" + string_literal
+    u32string_literal = "U" + string_literal
+    bad_string_literal = '"' + string_char + "*" + bad_escape + string_char + '*"'
+
+    # floating constants (K&R2: A.2.5.3)
+    exponent_part = r"""([eE][-+]?[0-9]+)"""
+    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
+    floating_constant = (
+        "(((("
+        + fractional_constant
+        + ")"
+        + exponent_part
+        + "?)|([0-9]+"
+        + exponent_part
+        + "))[FfLl]?)"
+    )
+    binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
+    hex_fractional_constant = (
+        "(((" + hex_digits + r""")?\.""" + hex_digits + ")|(" + hex_digits + r"""\.))"""
+    )
+    hex_floating_constant = (
+        "("
+        + hex_prefix
+        + "("
+        + hex_digits
+        + "|"
+        + hex_fractional_constant
+        + ")"
+        + binary_exponent_part
+        + "[FfLl]?)"
+    )
+
+    t_WHITESPACE = "[ \t]+"
+    t_ignore = "\r"
+
+    # The following floating and integer constants are defined as
+    # functions to impose a strict order (otherwise, decimal
+    # is placed before the others because its regex is longer,
+    # and this is bad)
+    #
+    @TOKEN(floating_constant)
+    def t_FLOAT_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(hex_floating_constant)
+    def t_HEX_FLOAT_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(hex_constant)
+    def t_INT_CONST_HEX(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(bin_constant)
+    def t_INT_CONST_BIN(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(bad_octal_constant)
+    def t_BAD_CONST_OCT(self, t: LexToken) -> None:
+        msg = "Invalid octal constant"
+        self._error(msg, t)
+
+    @TOKEN(octal_constant)
+    def t_INT_CONST_OCT(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(decimal_constant)
+    def t_INT_CONST_DEC(self, t: LexToken) -> LexToken:
+        return t
+
+    # Must come before bad_char_const, to prevent it from
+    # catching valid char constants as invalid
+    #
+    @TOKEN(multicharacter_constant)
+    def t_INT_CONST_CHAR(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(char_const)
+    def t_CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(wchar_const)
+    def t_WCHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u8char_const)
+    def t_U8CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u16char_const)
+    def t_U16CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u32char_const)
+    def t_U32CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(unmatched_quote)
+    def t_UNMATCHED_QUOTE(self, t: LexToken) -> None:
+        msg = "Unmatched '"
+        self._error(msg, t)
+
+    @TOKEN(bad_char_const)
+    def t_BAD_CHAR_CONST(self, t: LexToken) -> None:
+        msg = "Invalid char constant %s" % t.value
+        self._error(msg, t)
+
+    @TOKEN(wstring_literal)
+    def t_WSTRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u8string_literal)
+    def t_U8STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u16string_literal)
+    def t_U16STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u32string_literal)
+    def t_U32STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    # unmatched string literals are caught by the preprocessor
+
+    @TOKEN(bad_string_literal)
+    def t_BAD_STRING_LITERAL(self, t):
+        msg = "String contains invalid escape code"
+        self._error(msg, t)
+
+    @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*")
     def t_NAME(self, t: LexToken) -> LexToken:
-        r"[A-Za-z_~][A-Za-z0-9_]*"
         if t.value in self.keywords:
             t.type = t.value
         return t
 
+    @TOKEN(r"\#.*")
     def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]:
-        r"\#.*"
         m = _line_re.match(t.value)
         if m:
-            filename = m.group(2)
-            if filename not in self._filenames_set:
-                self.filenames.append(filename)
-                self._filenames_set.add(filename)
-            self.filename = filename
+            self.filename = m.group(2)
 
             self.line_offset = 1 + self.lex.lineno - int(m.group(1))
             return None
         else:
             return t
 
-    def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
-        r"\/\/.*\n?"
-        if t.value.startswith("///") or t.value.startswith("//!"):
-            self.comments.append(t.value.lstrip("\t ").rstrip("\n"))
-        t.lexer.lineno += t.value.count("\n")
-        return t
-
     t_DIVIDE = r"/(?!/)"
-    t_CHAR_LITERAL = "'.'"
     t_ELLIPSIS = r"\.\.\."
     t_DBL_LBRACKET = r"\[\["
     t_DBL_RBRACKET = r"\]\]"
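Aside (not part of the diff): the escape-sequence regexes above come from pycparser and use negative lookahead so that each escape can match only one way, which is what prevents the worst-case backtracking described in the comments. A minimal, self-contained check with Python's re module:

import re

simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
decimal_escape = r"""(\d+)(?!\d)"""
hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
escape_sequence = r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"
cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")"
char_const = re.compile("'" + cconst_char + "'")

# each escape matches exactly one way, so there is no ambiguity to backtrack through
assert char_const.fullmatch(r"'\123'")    # decimal escape
assert char_const.fullmatch(r"'\x7f'")    # hex escape
assert char_const.fullmatch("'a'")        # plain character
assert not char_const.fullmatch(r"'ab'")  # two characters is not a plain char constant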
@@ -225,36 +454,35 @@ class Lexer:
     t_SHIFT_LEFT = r"<<"
     # SHIFT_RIGHT introduces ambiguity
 
-    # found at http://wordaligned.org/articles/string-literals-and-regular-expressions
-    # TODO: This does not work with the string "bla \" bla"
-    t_STRING_LITERAL = r'"([^"\\]|\\.)*"'
+    t_STRING_LITERAL = string_literal
 
-    # Found at http://ostermiller.org/findcomment.html
-    def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken:
-        r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?"
-        if t.value.startswith("/**") or t.value.startswith("/*!"):
-            # not sure why, but get double new lines
-            v = t.value.replace("\n\n", "\n")
-            # strip prefixing whitespace
-            v = _multicomment_re.sub("\n*", v)
-            self.comments = v.splitlines()
+    @TOKEN(r"\/\/.*\n?")
+    def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
         t.lexer.lineno += t.value.count("\n")
         return t
 
+    # Found at http://ostermiller.org/findcomment.html
+    @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?")
+    def t_COMMENT_MULTILINE(self, t: LexToken) -> LexToken:
+        t.lexer.lineno += t.value.count("\n")
+        return t
+
+    @TOKEN(r"\n+")
     def t_NEWLINE(self, t: LexToken) -> LexToken:
-        r"\n+"
         t.lexer.lineno += len(t.value)
-        del self.comments[:]
         return t
 
     def t_error(self, t: LexToken) -> None:
-        print("Lex error: ", t)
+        self._error(f"Illegal character {t.value!r}", t)
 
+    def _error(self, msg: str, tok: LexToken):
+        tok.location = self.current_location()
+        raise LexError(msg, tok)
+
     _lexer = None
     lex: lex.Lexer
-    lineno: int
 
-    def __new__(cls, *args, **kwargs) -> "Lexer":
+    def __new__(cls, *args, **kwargs) -> "PlyLexer":
         # only build the lexer once
         inst = super().__new__(cls)
         if cls._lexer is None:
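Aside (not part of the diff): the token rules now attach their regexes with PLY's TOKEN decorator instead of the old docstring-as-regex convention. A rough sketch of the mechanism, assuming the vendored PLY copy added by this PR is importable as cxxheaderparser._ply (the standalone ply package behaves the same way):

from cxxheaderparser._ply import lex
from cxxheaderparser._ply.lex import TOKEN

tokens = ("INT_CONST_DEC", "NAME")


# function rules are tried in definition order, which is how the new lexer
# forces the constant rules above to be checked in a strict order
@TOKEN(r"[0-9]+")
def t_INT_CONST_DEC(t):
    return t


t_NAME = r"[A-Za-z_][A-Za-z0-9_]*"
t_ignore = " \t"


def t_error(t):
    t.lexer.skip(1)


lexer = lex.lex()
lexer.input("x1 42")
print([(tok.type, tok.value) for tok in lexer])  # [('NAME', 'x1'), ('INT_CONST_DEC', '42')]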
@@ -266,164 +494,80 @@ class Lexer:
 
     def __init__(self, filename: typing.Optional[str] = None):
         self.input: typing.Callable[[str], None] = self.lex.input
+        self.token: typing.Callable[[], LexToken] = self.lex.token
 
         # For tracking current file/line position
         self.filename = filename
         self.line_offset = 0
 
-        self.filenames: typing.List[str] = []
-        self._filenames_set: typing.Set[str] = set()
-
-        if filename:
-            self.filenames.append(filename)
-            self._filenames_set.add(filename)
-
-        # Doxygen comments
-        self.comments = []
-
-        self.lookahead = typing.Deque[LexToken]()
-
-        # For 'set_group_of_tokens' support
-        self._get_token: typing.Callable[[], LexToken] = self.lex.token
-        self.lookahead_stack = typing.Deque[typing.Deque[LexToken]]()
-
-    def current_location(self) -> Location:
-        if self.lookahead:
-            return self.lookahead[0].location
-        return Location(self.filename, self.lex.lineno - self.line_offset)
-
-    def get_doxygen(self) -> typing.Optional[str]:
-        """
-        This should be called after the first element of something has
-        been consumed.
-
-        It will lookahead for comments that come after the item, if prior
-        comments don't exist.
-        """
-
-        # Assumption: This function is either called at the beginning of a
-        # statement or at the end of a statement
-
-        if self.comments:
-            comments = self.comments
-        else:
-            comments = []
-            # only look for comments until a newline (including lookahead)
-            for tok in self.lookahead:
-                if tok.type == "NEWLINE":
-                    return None
-
-            while True:
-                tok = self._get_token()
-                comments.extend(self.comments)
-
-                if tok is None:
-                    break
-
-                tok.location = Location(self.filename, tok.lineno - self.line_offset)
-                ttype = tok.type
-                if ttype == "NEWLINE":
-                    self.lookahead.append(tok)
-                    break
-
-                if ttype not in self._discard_types:
-                    self.lookahead.append(tok)
-
-                if ttype == "NAME":
-                    break
-
-                del self.comments[:]
-
-        comment_str = "\n".join(comments)
-        del self.comments[:]
-        if comment_str:
-            return comment_str
-
-        return None
-
-    _discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"}
-
-    def _token_limit_exceeded(self) -> typing.NoReturn:
-        from .errors import CxxParseError
-
-        raise CxxParseError("no more tokens left in this group")
-
-    @contextlib.contextmanager
-    def set_group_of_tokens(
-        self, toks: typing.List[LexToken]
-    ) -> typing.Generator[typing.Deque[LexToken], None, None]:
-        # intended for use when you have a set of tokens that you know
-        # must be consumed, such as a paren grouping or some type of
-        # lookahead case
-
-        stack = self.lookahead_stack
-        restore_fn = False
-
-        if not stack:
-            restore_fn = True
-            self._get_token = self._token_limit_exceeded
-
-        this_buf = typing.Deque[LexToken](toks)
-        prev_buf = self.lookahead
-        stack.append(prev_buf)
-        self.lookahead = this_buf
-
-        try:
-            yield this_buf
-        finally:
-            buf = stack.pop()
-            if prev_buf is not buf:
-                raise ValueError("internal error")
-
-            self.lookahead = prev_buf
-
-            if restore_fn:
-                self._get_token = self.lex.token
+    def current_location(self) -> Location:
+        return Location(self.filename, self.lex.lineno - self.line_offset)
+
+
+class TokenStream:
+    """
+    Provides access to a stream of tokens
+    """
+
+    tokbuf: typing.Deque[LexToken]
+
+    def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
+        """
+        Fills tokbuf with tokens from the next line. Return True if at least
+        one token was added to the buffer
+        """
+        raise NotImplementedError
+
+    def current_location(self) -> Location:
+        raise NotImplementedError
+
+    def get_doxygen(self) -> typing.Optional[str]:
+        """
+        This is called at the point that you want doxygen information
+        """
+        raise NotImplementedError
+
+    def get_doxygen_after(self) -> typing.Optional[str]:
+        """
+        This is called to retrieve doxygen information after a statement
+        """
+        raise NotImplementedError
+
+    _discard_types = {
+        "NEWLINE",
+        "COMMENT_SINGLELINE",
+        "COMMENT_MULTILINE",
+        "WHITESPACE",
+    }
 
     def token(self) -> LexToken:
-        tok = None
-        while self.lookahead:
-            tok = self.lookahead.popleft()
-            if tok.type not in self._discard_types:
-                return tok
-
-        while True:
-            tok = self._get_token()
-            if tok is None:
-                raise EOFError("unexpected end of file")
-
-            if tok.type not in self._discard_types:
-                tok.location = Location(self.filename, tok.lineno - self.line_offset)
-                break
-
-        return tok
+        tokbuf = self.tokbuf
+        while True:
+            while tokbuf:
+                tok = tokbuf.popleft()
+                if tok.type not in self._discard_types:
+                    return tok
+
+            if not self._fill_tokbuf(tokbuf):
+                raise EOFError("unexpected end of file")
 
     def token_eof_ok(self) -> typing.Optional[LexToken]:
-        tok = None
-        while self.lookahead:
-            tok = self.lookahead.popleft()
-            if tok.type not in self._discard_types:
-                return tok
-
-        while True:
-            tok = self._get_token()
-            if tok is None:
-                break
-
-            if tok.type not in self._discard_types:
-                tok.location = Location(self.filename, tok.lineno - self.line_offset)
-                break
-
-        return tok
+        tokbuf = self.tokbuf
+        while True:
+            while tokbuf:
+                tok = tokbuf.popleft()
+                if tok.type not in self._discard_types:
+                    return tok
+
+            if not self._fill_tokbuf(tokbuf):
+                return None
 
     def token_if(self, *types: str) -> typing.Optional[LexToken]:
         tok = self.token_eof_ok()
         if tok is None:
             return None
         if tok.type not in types:
-            # put it back on the left in case it was retrieved
-            # from the lookahead buffer
-            self.lookahead.appendleft(tok)
+            self.tokbuf.appendleft(tok)
             return None
         return tok
 
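Aside (not part of the diff): TokenStream is the new abstraction point. A subclass only supplies _fill_tokbuf() plus the location/doxygen hooks, and the buffering in token(), token_eof_ok() and token_if() is inherited. A hypothetical subclass, just to show the contract (assumes this branch is importable):

import typing

from cxxheaderparser.lexer import LexToken, Location, TokenStream


class ReplayTokenStream(TokenStream):
    """Hypothetical stream that replays lines of already-lexed tokens."""

    def __init__(self, lines: typing.List[typing.List[LexToken]]) -> None:
        self._lines = lines
        self.tokbuf = typing.Deque[LexToken]()

    def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
        # one call adds one line of tokens; False tells the base class we're done
        if not self._lines:
            return False
        tokbuf.extend(self._lines.pop(0))
        return True

    def current_location(self) -> Location:
        if self.tokbuf:
            return self.tokbuf[0].location
        return Location("<replay>", 0)

    def get_doxygen(self) -> typing.Optional[str]:
        return None

    def get_doxygen_after(self) -> typing.Optional[str]:
        return None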
@@ -432,9 +576,7 @@ class Lexer:
         if tok is None:
             return None
         if tok.type not in types:
-            # put it back on the left in case it was retrieved
-            # from the lookahead buffer
-            self.lookahead.appendleft(tok)
+            self.tokbuf.appendleft(tok)
             return None
         return tok
 
@@ -443,9 +585,7 @@ class Lexer:
         if tok is None:
             return None
         if tok.value not in vals:
-            # put it back on the left in case it was retrieved
-            # from the lookahead buffer
-            self.lookahead.appendleft(tok)
+            self.tokbuf.appendleft(tok)
             return None
         return tok
 
@@ -454,9 +594,7 @@ class Lexer:
         if tok is None:
             return None
         if tok.type in types:
-            # put it back on the left in case it was retrieved
-            # from the lookahead buffer
-            self.lookahead.appendleft(tok)
+            self.tokbuf.appendleft(tok)
             return None
         return tok
 
@@ -464,18 +602,217 @@ class Lexer:
         tok = self.token_eof_ok()
         if not tok:
             return False
-        self.lookahead.appendleft(tok)
+        self.tokbuf.appendleft(tok)
         return tok.type in types
 
     def return_token(self, tok: LexToken) -> None:
-        self.lookahead.appendleft(tok)
+        self.tokbuf.appendleft(tok)
 
     def return_tokens(self, toks: typing.Sequence[LexToken]) -> None:
-        self.lookahead.extendleft(reversed(toks))
+        self.tokbuf.extendleft(reversed(toks))
+
+
+class LexerTokenStream(TokenStream):
+    """
+    Provides tokens from using PlyLexer on the given input text
+    """
+
+    _user_defined_literal_start = {
+        "FLOAT_CONST",
+        "HEX_FLOAT_CONST",
+        "INT_CONST_HEX",
+        "INT_CONST_BIN",
+        "INT_CONST_OCT",
+        "INT_CONST_DEC",
+        "INT_CONST_CHAR",
+        "CHAR_CONST",
+        "WCHAR_CONST",
+        "U8CHAR_CONST",
+        "U16CHAR_CONST",
+        "U32CHAR_CONST",
+        # String literals
+        "STRING_LITERAL",
+        "WSTRING_LITERAL",
+        "U8STRING_LITERAL",
+        "U16STRING_LITERAL",
+        "U32STRING_LITERAL",
+    }
+
+    def __init__(self, filename: typing.Optional[str], content: str) -> None:
+        self._lex = PlyLexer(filename)
+        self._lex.input(content)
+        self.tokbuf = typing.Deque[LexToken]()
+
+    def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
+        get_token = self._lex.token
+        tokbuf = self.tokbuf
+
+        tok = get_token()
+        if tok is None:
+            return False
+
+        udl_start = self._user_defined_literal_start
+
+        while True:
+            tok.location = self._lex.current_location()
+            tokbuf.append(tok)
+
+            if tok.type == "NEWLINE":
+                break
+
+            # detect/combine user defined literals
+            if tok.type in udl_start:
+                tok2 = get_token()
+                if tok2 is None:
+                    break
+
+                if tok2.type != "NAME" or tok2.value[0] != "_":
+                    tok = tok2
+                    continue
+
+                tok.value = tok.value + tok2.value
+                tok.type = f"UD_{tok.type}"
+
+            tok = get_token()
+            if tok is None:
+                break
+
+        return True
+
+    def current_location(self) -> Location:
+        if self.tokbuf:
+            return self.tokbuf[0].location
+        return self._lex.current_location()
+
+    def get_doxygen(self) -> typing.Optional[str]:
+
+        tokbuf = self.tokbuf
+
+        # fill the token buffer if it's empty (which indicates a newline)
+        if not tokbuf and not self._fill_tokbuf(tokbuf):
+            return None
+
+        comments: typing.List[LexToken] = []
+
+        # retrieve any comments in the stream right before
+        # the first non-discard element
+        keep_going = True
+        while True:
+            while tokbuf:
+                tok = tokbuf.popleft()
+                if tok.type == "NEWLINE":
+                    comments.clear()
+                elif tok.type == "WHITESPACE":
+                    pass
+                elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"):
+                    comments.append(tok)
+                else:
+                    tokbuf.appendleft(tok)
+                    keep_going = False
+                    break
+
+            if not keep_going:
+                break
+
+            if not self._fill_tokbuf(tokbuf):
+                break
+
+        if comments:
+            return self._extract_comments(comments)
+
+        return None
+
+    def get_doxygen_after(self) -> typing.Optional[str]:
+        tokbuf = self.tokbuf
+
+        # if there's a newline directly after a statement, we're done
+        if not tokbuf:
+            return None
+
+        # retrieve comments after non-discard elements
+        comments: typing.List[LexToken] = []
+        new_tokbuf = typing.Deque[LexToken]()
+
+        # This is different: we only extract tokens here
+        while tokbuf:
+            tok = tokbuf.popleft()
+            if tok.type == "NEWLINE":
+                break
+            elif tok.type == "WHITESPACE":
+                new_tokbuf.append(tok)
+            elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"):
+                comments.append(tok)
+            else:
+                new_tokbuf.append(tok)
+                if comments:
+                    break
+
+        new_tokbuf.extend(tokbuf)
+        self.tokbuf = new_tokbuf
+
+        if comments:
+            return self._extract_comments(comments)
+
+        return None
+
+    def _extract_comments(self, comments: typing.List[LexToken]):
+        # Now we have comments, need to extract the text from them
+        comment_lines: typing.List[str] = []
+        for c in comments:
+            text = c.value
+            if c.type == "COMMENT_SINGLELINE":
+                if text.startswith("///") or text.startswith("//!"):
+                    comment_lines.append(text.rstrip("\n"))
+            else:
+                if text.startswith("/**") or text.startswith("/*!"):
+                    # not sure why, but get double new lines
+                    text = text.replace("\n\n", "\n")
+                    # strip prefixing whitespace
+                    text = _multicomment_re.sub("\n*", text)
+                    comment_lines = text.splitlines()
+
+        comment_str = "\n".join(comment_lines)
+        if comment_str:
+            return comment_str
+
+        return None
+
+
+class BoundedTokenStream(TokenStream):
+    """
+    Provides tokens from a fixed list of tokens.
+
+    Intended for use when you have a group of tokens that you know
+    must be consumed, such as a paren grouping or some type of
+    lookahead case
+    """
+
+    def __init__(self, toks: typing.List[LexToken]) -> None:
+        self.tokbuf = typing.Deque[LexToken](toks)
+
+    def has_tokens(self) -> bool:
+        return len(self.tokbuf) > 0
+
+    def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
+        from .errors import CxxParseError
+
+        raise CxxParseError("no more tokens left in this group")
+
+    def current_location(self) -> Location:
+        if self.tokbuf:
+            return self.tokbuf[0].location
+        raise ValueError("internal error")
+
+    def get_doxygen(self) -> typing.Optional[str]:
+        # comment tokens aren't going to be in this stream
+        return None
+
+    def get_doxygen_after(self) -> typing.Optional[str]:
+        return None
 
 
 if __name__ == "__main__":  # pragma: no cover
     try:
-        lex.runmain(lexer=Lexer(None))
+        lex.runmain(lexer=PlyLexer(None))
     except EOFError:
         pass
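Aside (not part of the diff): a usage sketch of the new stream API on this branch. The parser constructs a LexerTokenStream from the header content and asks it for tokens and doxygen comments; names such as demo.h and MyType below are made up for illustration:

from cxxheaderparser.lexer import LexerTokenStream

stream = LexerTokenStream("demo.h", "/// doc comment\nMyType x = 1;\n")

print(stream.get_doxygen())              # '/// doc comment'
tok = stream.token()
print(tok.type, tok.value)               # NAME MyType
print(stream.token_if("INT_CONST_DEC"))  # None -- the next token is the NAME 'x', so it is put back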
@@ -4,8 +4,9 @@ import inspect
 import re
 import typing
 
+from . import lexer
 from .errors import CxxParseError
-from .lexer import Lexer, LexToken, Location, PhonyEnding
+from .lexer import LexToken, Location, PhonyEnding
 from .options import ParserOptions
 from .parserstate import (
     ClassBlockState,
@@ -80,8 +81,7 @@ class CxxParser:
         self.visitor = visitor
         self.filename = filename
 
-        self.lex = Lexer(filename)
-        self.lex.input(content)
+        self.lex: lexer.TokenStream = lexer.LexerTokenStream(filename, content)
 
         global_ns = NamespaceDecl([], False)
         self.current_namespace = global_ns
@@ -308,25 +308,34 @@ class CxxParser:
             ";": lambda _1, _2: None,
         }
 
+        _keep_doxygen = {"__declspec", "alignas", "__attribute__", "DBL_LBRACKET"}
+
         tok = None
 
         get_token_eof_ok = self.lex.token_eof_ok
         get_doxygen = self.lex.get_doxygen
 
+        doxygen = None
+
         try:
             while True:
+                if doxygen is None:
+                    doxygen = get_doxygen()
+
                 tok = get_token_eof_ok()
                 if not tok:
                     break
 
-                doxygen = get_doxygen()
-
                 fn = _translation_unit_tokens.get(tok.type)
                 if fn:
                     fn(tok, doxygen)
+
+                    if tok.type not in _keep_doxygen:
+                        doxygen = None
                 else:
                     # this processes ambiguous declarations
                     self._parse_declarations(tok, doxygen)
+                    doxygen = None
 
         except Exception as e:
             if self.verbose:
@@ -610,7 +619,12 @@ class CxxParser:
        # append a token to make other parsing components happy
        raw_toks.append(PhonyEnding)

-       with self.lex.set_group_of_tokens(raw_toks) as remainder:
+       old_lex = self.lex
+       try:
+           # set up a temporary token stream with the tokens we need to parse
+           tmp_lex = lexer.BoundedTokenStream(raw_toks)
+           self.lex = tmp_lex
+
            try:
                parsed_type, mods = self._parse_type(None)
                if parsed_type is None:
@@ -622,9 +636,12 @@
            except CxxParseError:
                dtype = None
            else:
-               if remainder:
+               if tmp_lex.has_tokens():
                    dtype = None

+       finally:
+           self.lex = old_lex
+
        if self.lex.token_if("ELLIPSIS"):
            param_pack = True

@@ -939,12 +956,16 @@ class CxxParser:
         values: typing.List[Enumerator] = []
 
         while True:
+            doxygen = self.lex.get_doxygen()
+
             name_tok = self._next_token_must_be("}", "NAME")
             if name_tok.value == "}":
                 break
 
+            if doxygen is None:
+                doxygen = self.lex.get_doxygen_after()
+
             name = name_tok.value
-            doxygen = self.lex.get_doxygen()
             value = None
 
             tok = self._next_token_must_be("}", ",", "=", "DBL_LBRACKET")
@@ -1171,7 +1192,7 @@ class CxxParser:
 
     def _parse_bitfield(self) -> int:
         # is a integral constant expression... for now, just do integers
-        tok = self._next_token_must_be("NUMBER")
+        tok = self._next_token_must_be("INT_CONST_DEC")
         return int(tok.value)
 
     def _parse_field(
@@ -1244,7 +1265,7 @@ class CxxParser:
 
         if doxygen is None:
             # try checking after the var
-            doxygen = self.lex.get_doxygen()
+            doxygen = self.lex.get_doxygen_after()
 
         if is_typedef:
             if not name:
@@ -1,15 +1,28 @@
 import typing
 
-from .lexer import LexToken, Lexer
+from .lexer import LexToken, PlyLexer, LexerTokenStream
 from .types import Token
 
 # key: token type, value: (left spacing, right spacing)
 _want_spacing = {
-    "NUMBER": (2, 2),
-    "FLOAT_NUMBER": (2, 2),
+    "FLOAT_CONST": (2, 2),
+    "HEX_FLOAT_CONST": (2, 2),
+    "INT_CONST_HEX": (2, 2),
+    "INT_CONST_BIN": (2, 2),
+    "INT_CONST_OCT": (2, 2),
+    "INT_CONST_DEC": (2, 2),
+    "INT_CONST_CHAR": (2, 2),
     "NAME": (2, 2),
-    "CHAR_LITERAL": (2, 2),
+    "CHAR_CONST": (2, 2),
+    "WCHAR_CONST": (2, 2),
+    "U8CHAR_CONST": (2, 2),
+    "U16CHAR_CONST": (2, 2),
+    "U32CHAR_CONST": (2, 2),
     "STRING_LITERAL": (2, 2),
+    "WSTRING_LITERAL": (2, 2),
+    "U8STRING_LITERAL": (2, 2),
+    "U16STRING_LITERAL": (2, 2),
+    "U32STRING_LITERAL": (2, 2),
     "ELLIPSIS": (2, 2),
     ">": (0, 2),
     ")": (0, 1),
@@ -19,7 +32,7 @@ _want_spacing = {
     "&": (0, 2),
 }
 
-_want_spacing.update(dict.fromkeys(Lexer.keywords, (2, 2)))
+_want_spacing.update(dict.fromkeys(PlyLexer.keywords, (2, 2)))
 
 
 def tokfmt(toks: typing.List[Token]) -> str:
@@ -54,9 +67,9 @@ if __name__ == "__main__":  # pragma: no cover
     parser.add_argument("header")
     args = parser.parse_args()
 
-    lexer = Lexer(args.header)
-    with open(lexer.filename) as fp:
-        lexer.input(fp.read())  # type: ignore
+    filename: str = args.header
+    with open(filename) as fp:
+        lexer = LexerTokenStream(filename, fp.read())
 
     toks: typing.List[Token] = []
     while True:
@@ -209,16 +209,13 @@ def test_doxygen_fn_3slash() -> None:
     )
 
 
-def test_doxygen_fn_cstyle() -> None:
+def test_doxygen_fn_cstyle1() -> None:
     content = """
-    // clang-format off
-
     /**
      * fn comment
      */
     void
     fn();
-
     """
     data = parse_string(content, cleandoc=True)
 
@@ -238,6 +235,32 @@ def test_doxygen_fn_cstyle() -> None:
     )
 
 
+def test_doxygen_fn_cstyle2() -> None:
+    content = """
+    /*!
+     * fn comment
+     */
+    void
+    fn();
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData(
+        namespace=NamespaceScope(
+            functions=[
+                Function(
+                    return_type=Type(
+                        typename=PQName(segments=[FundamentalSpecifier(name="void")])
+                    ),
+                    name=PQName(segments=[NameSpecifier(name="fn")]),
+                    parameters=[],
+                    doxygen="/*!\n* fn comment\n*/",
+                )
+            ]
+        )
+    )
+
+
 def test_doxygen_var_above() -> None:
     content = """
     // clang-format off
@@ -292,6 +315,44 @@ def test_doxygen_var_after() -> None:
     )
 
 
+def test_doxygen_multiple_variables() -> None:
+    content = """
+    int x; /// this is x
+    int y; /// this is y
+    /// this is also y
+    int z; /// this is z
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData(
+        namespace=NamespaceScope(
+            variables=[
+                Variable(
+                    name=PQName(segments=[NameSpecifier(name="x")]),
+                    type=Type(
+                        typename=PQName(segments=[FundamentalSpecifier(name="int")])
+                    ),
+                    doxygen="/// this is x",
+                ),
+                Variable(
+                    name=PQName(segments=[NameSpecifier(name="y")]),
+                    type=Type(
+                        typename=PQName(segments=[FundamentalSpecifier(name="int")])
+                    ),
+                    doxygen="/// this is y\n/// this is also y",
+                ),
+                Variable(
+                    name=PQName(segments=[NameSpecifier(name="z")]),
+                    type=Type(
+                        typename=PQName(segments=[FundamentalSpecifier(name="int")])
+                    ),
+                    doxygen="/// this is z",
+                ),
+            ]
+        )
+    )
+
+
 def test_doxygen_namespace() -> None:
     content = """
     /**
@@ -329,3 +390,50 @@ def test_doxygen_namespace() -> None:
         }
     )
 )
+
+
+def test_doxygen_declspec() -> None:
+    content = """
+    /// declspec comment
+    __declspec(thread) int i = 1;
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData(
+        namespace=NamespaceScope(
+            variables=[
+                Variable(
+                    name=PQName(segments=[NameSpecifier(name="i")]),
+                    type=Type(
+                        typename=PQName(segments=[FundamentalSpecifier(name="int")])
+                    ),
+                    value=Value(tokens=[Token(value="1")]),
+                    doxygen="/// declspec comment",
+                )
+            ]
+        )
+    )
+
+
+def test_doxygen_attribute() -> None:
+    content = """
+    /// hasattr comment
+    [[nodiscard]]
+    int hasattr();
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData(
+        namespace=NamespaceScope(
+            functions=[
+                Function(
+                    return_type=Type(
+                        typename=PQName(segments=[FundamentalSpecifier(name="int")])
+                    ),
+                    name=PQName(segments=[NameSpecifier(name="hasattr")]),
+                    parameters=[],
+                    doxygen="/// hasattr comment",
+                )
+            ]
+        )
+    )
@@ -236,3 +236,34 @@ def test_final() -> None:
         ],
     )
 )
+
+
+#
+# User defined literals
+#
+
+
+def test_user_defined_literal() -> None:
+    content = """
+    units::volt_t v = 1_V;
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData(
+        namespace=NamespaceScope(
+            variables=[
+                Variable(
+                    name=PQName(segments=[NameSpecifier(name="v")]),
+                    type=Type(
+                        typename=PQName(
+                            segments=[
+                                NameSpecifier(name="units"),
+                                NameSpecifier(name="volt_t"),
+                            ]
+                        )
+                    ),
+                    value=Value(tokens=[Token(value="1_V")]),
+                )
+            ]
+        )
+    )
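Aside (not part of the diff): the test above works because LexerTokenStream._fill_tokbuf glues a numeric/char/string constant to an immediately following NAME that starts with an underscore, producing a single UD_* token. A quick way to observe that, assuming this branch is importable:

from cxxheaderparser.lexer import LexerTokenStream

stream = LexerTokenStream(None, "units::volt_t v = 1_V;\n")

seen = []
tok = stream.token_eof_ok()
while tok is not None:
    seen.append((tok.type, tok.value))
    tok = stream.token_eof_ok()

print(seen)  # expected to contain ('UD_INT_CONST_DEC', '1_V')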
@@ -1,6 +1,6 @@
 import pytest
 
-from cxxheaderparser.lexer import Lexer
+from cxxheaderparser.lexer import PlyLexer, LexerTokenStream
 from cxxheaderparser.tokfmt import tokfmt
 from cxxheaderparser.types import Token
 
@@ -40,14 +40,15 @@ def test_tokfmt(instr: str) -> None:
     Each input string is exactly what the output of tokfmt should be
     """
     toks = []
-    lexer = Lexer("")
+    lexer = PlyLexer("")
     lexer.input(instr)
 
     while True:
-        tok = lexer.token_eof_ok()
+        tok = lexer.token()
         if not tok:
             break
 
-        toks.append(Token(tok.value, tok.type))
+        if tok.type not in LexerTokenStream._discard_types:
+            toks.append(Token(tok.value, tok.type))
 
     assert tokfmt(toks) == instr