Restructure preprocessor support

- Remove partial support for #define and other PP directives
- Allow pragma to span multiple lines
- Pragma now emits a list of tokens instead of a single string
- Ignore #warning directive if present
Dustin Spicuzza 2023-07-23 17:03:04 -04:00
parent b07e1f81a6
commit bd9907ad79
5 changed files with 136 additions and 61 deletions
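For downstream users, the visible change is the shape of Pragma.content: it is now a Value holding a token list rather than a raw string, and #define directives are no longer collected. A minimal sketch of the new behavior, mirroring the updated tests below:

from cxxheaderparser.simple import parse_string

data = parse_string("#pragma once\n")
# Pragma.content is now a Value of Token objects, not a plain string
print([tok.value for tok in data.pragmas[0].content.tokens])  # -> ['once']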

View File

@@ -51,6 +51,7 @@ class LexToken(Protocol):
     #: private
     lexer: lex.Lexer
+    lexmatch: "re.Match"


 PhonyEnding: LexToken = lex.LexToken()  # type: ignore
@@ -175,7 +176,10 @@ class PlyLexer:
         # Comments
         "COMMENT_SINGLELINE",
         "COMMENT_MULTILINE",
-        "PRECOMP_MACRO",
+        "LINE_DIRECTIVE",
+        "PRAGMA_DIRECTIVE",
+        "INCLUDE_DIRECTIVE",
+        "PP_DIRECTIVE",
         # misc
         "DIVIDE",
         "NEWLINE",
@@ -434,17 +438,36 @@ class PlyLexer:
             t.type = t.value
         return t

-    @TOKEN(r"\#.*")
-    def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]:
-        m = _line_re.match(t.value)
-        if m:
-            self.filename = m.group(2)
-            self.line_offset = 1 + self.lex.lineno - int(m.group(1))
-            return None
-        else:
-            return t
+    @TOKEN(r'\#[\t ]*line (\d+) "(.*)"')
+    def t_LINE_DIRECTIVE(self, t: LexToken) -> None:
+        m = t.lexmatch
+        self.filename = m.group(2)
+        self.line_offset = 1 + self.lex.lineno - int(m.group(1))
+
+    @TOKEN(r"\#[\t ]*pragma")
+    def t_PRAGMA_DIRECTIVE(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(r"\#[\t ]*include (.*)")
+    def t_INCLUDE_DIRECTIVE(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(r"\#(.*)")
+    def t_PP_DIRECTIVE(self, t: LexToken):
+        # ignore C++23 warning directive
+        if t.value.startswith("#warning"):
+            return
+        if "define" in t.value:
+            msgtype = "#define"
+        else:
+            msgtype = "preprocessor"
+        self._error(
+            "cxxheaderparser does not support "
+            + msgtype
+            + " directives, please use a C++ preprocessor first",
+            t,
+        )

     t_DIVIDE = r"/(?!/)"
     t_ELLIPSIS = r"\.\.\."
     t_DBL_LBRACKET = r"\[\["
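The ordering of these rules matters: PLY tries function-based token rules in the order they are defined, so the specific line/pragma/include rules win over the catch-all t_PP_DIRECTIVE. A standalone sketch of that PLY behavior (illustrative, not code from this repo):

import ply.lex as lex

tokens = ("PRAGMA_DIRECTIVE", "PP_DIRECTIVE", "NAME")

def t_PRAGMA_DIRECTIVE(t):
    r"\#[\t ]*pragma"
    return t

def t_PP_DIRECTIVE(t):
    r"\#(.*)"
    return t

def t_NAME(t):
    r"[A-Za-z_][A-Za-z0-9_]*"
    return t

t_ignore = " \t"

def t_newline(t):
    r"\n+"
    t.lexer.lineno += len(t.value)

def t_error(t):
    raise RuntimeError(f"illegal character {t.value[0]!r}")

lexer = lex.lex()
lexer.input('#pragma once\n#define X 1\n')
# t_PRAGMA_DIRECTIVE is defined first, so it wins over the catch-all
print([tok.type for tok in lexer])
# -> ['PRAGMA_DIRECTIVE', 'NAME', 'PP_DIRECTIVE']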
@@ -541,6 +564,12 @@ class TokenStream:
         "WHITESPACE",
     }

+    _discard_types_except_newline = {
+        "COMMENT_SINGLELINE",
+        "COMMENT_MULTILINE",
+        "WHITESPACE",
+    }
+
     def token(self) -> LexToken:
         tokbuf = self.tokbuf
         while True:
@@ -563,6 +592,17 @@ class TokenStream:
             if not self._fill_tokbuf(tokbuf):
                 return None

+    def token_newline_eof_ok(self) -> typing.Optional[LexToken]:
+        tokbuf = self.tokbuf
+        while True:
+            while tokbuf:
+                tok = tokbuf.popleft()
+                if tok.type not in self._discard_types_except_newline:
+                    return tok
+
+            if not self._fill_tokbuf(tokbuf):
+                return None
+
     def token_if(self, *types: str) -> typing.Optional[LexToken]:
         tok = self.token_eof_ok()
         if tok is None:
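The new token_newline_eof_ok differs from token_eof_ok only in that NEWLINE survives filtering, which is what lets the pragma consumer stop at the end of the line. A small end-to-end check of that property, using the public parse_string entry point (the input is hypothetical):

from cxxheaderparser.simple import parse_string

data = parse_string("#pragma once\nint x;\n")
assert len(data.pragmas) == 1
# the declaration on the next line is not swallowed by the pragma
assert len(data.namespace.variables) == 1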

View File

@@ -304,7 +304,8 @@ class CxxParser:
             "{": self._on_empty_block_start,
             "}": self._on_block_end,
             "DBL_LBRACKET": self._consume_attribute_specifier_seq,
-            "PRECOMP_MACRO": self._process_preprocessor_token,
+            "INCLUDE_DIRECTIVE": self._process_include_directive,
+            "PRAGMA_DIRECTIVE": self._process_pragma_directive,
             ";": lambda _1, _2: None,
         }
@@ -361,20 +362,29 @@ class CxxParser:
     _preprocessor_compress_re = re.compile(r"^#[\t ]+")
     _preprocessor_split_re = re.compile(r"[\t ]+")

-    def _process_preprocessor_token(
-        self, tok: LexToken, doxygen: typing.Optional[str]
-    ) -> None:
+    def _process_include_directive(self, tok: LexToken, doxygen: typing.Optional[str]):
         value = self._preprocessor_compress_re.sub("#", tok.value)
         svalue = self._preprocessor_split_re.split(value, 1)
         if len(svalue) == 2:
             self.state.location = tok.location
-            macro = svalue[0].lower().replace(" ", "")
-            if macro.startswith("#include"):
-                self.visitor.on_include(self.state, svalue[1])
-            elif macro.startswith("#define"):
-                self.visitor.on_define(self.state, svalue[1])
-            elif macro.startswith("#pragma"):
-                self.visitor.on_pragma(self.state, svalue[1])
+            self.visitor.on_include(self.state, svalue[1])
+        else:
+            raise CxxParseError("incomplete #include directive", tok)
+
+    def _process_pragma_directive(self, _: LexToken, doxygen: typing.Optional[str]):
+        # consume all tokens until the end of the line
+        # -- but if we find a paren, get the group
+        tokens: LexTokenList = []
+        while True:
+            tok = self.lex.token_newline_eof_ok()
+            if not tok or tok.type == "NEWLINE":
+                break
+            if tok.type in self._balanced_token_map:
+                tokens.extend(self._consume_balanced_tokens(tok))
+            else:
+                tokens.append(tok)
+
+        self.visitor.on_pragma(self.state, self._create_value(tokens))

     #
     # Various
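A hedged usage sketch of the new pragma handling: when the body contains a paren, the balanced group is consumed whole and flattened into the token list. The pragma text below is illustrative, not taken from this commit:

from cxxheaderparser.simple import parse_string

data = parse_string("#pragma warning(disable: 4996)\n")
# the parenthesized group is kept, parens included, as flat tokens
print([tok.value for tok in data.pragmas[0].content.tokens])
# expected: ['warning', '(', 'disable', ':', '4996', ')']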

View File

@@ -45,6 +45,7 @@ from .types import (
     UsingAlias,
     UsingDecl,
     Variable,
+    Value,
 )

 from .parserstate import (
@@ -123,14 +124,9 @@ class NamespaceScope:
 Block = typing.Union[ClassScope, NamespaceScope]


-@dataclass
-class Define:
-    content: str
-
-
 @dataclass
 class Pragma:
-    content: str
+    content: Value


 @dataclass
@@ -171,9 +167,6 @@ class ParsedData:
     #: Global namespace
     namespace: NamespaceScope = field(default_factory=lambda: NamespaceScope())

-    #: Any ``#define`` preprocessor directives encountered
-    defines: typing.List[Define] = field(default_factory=list)
-
     #: Any ``#pragma`` directives encountered
     pragmas: typing.List[Pragma] = field(default_factory=list)
@@ -208,10 +201,7 @@ class SimpleCxxVisitor:
         self.data = ParsedData(self.namespace)

-    def on_define(self, state: State, content: str) -> None:
-        self.data.defines.append(Define(content))
-
-    def on_pragma(self, state: State, content: str) -> None:
+    def on_pragma(self, state: State, content: Value) -> None:
         self.data.pragmas.append(Pragma(content))

     def on_include(self, state: State, filename: str) -> None:

View File

@@ -20,6 +20,7 @@ from .types import (
     UsingAlias,
     UsingDecl,
     Variable,
+    Value,
 )

 from .parserstate import (
@@ -36,14 +37,7 @@ class CxxVisitor(Protocol):
     Defines the interface used by the parser to emit events
     """

-    def on_define(self, state: State, content: str) -> None:
-        """
-        .. warning:: cxxheaderparser intentionally does not have a C preprocessor
-                     implementation. If you are parsing code with macros in it,
-                     use a conforming preprocessor like ``pcpp``
-        """
-
-    def on_pragma(self, state: State, content: str) -> None:
+    def on_pragma(self, state: State, content: Value) -> None:
         """
         Called once for each ``#pragma`` directive encountered
         """

View File

@@ -20,7 +20,6 @@ from cxxheaderparser.simple import (
     Pragma,
     parse_string,
     ParsedData,
-    Define,
 )

 #
@@ -28,31 +27,17 @@ from cxxheaderparser.simple import (
 #

-def test_define() -> None:
-    content = """
-      #define simple
-      #define complex(thing) stuff(thing)
-      # define spaced
-    """
-    data = parse_string(content, cleandoc=True)
-
-    assert data == ParsedData(
-        defines=[
-            Define(content="simple"),
-            Define(content="complex(thing) stuff(thing)"),
-            Define(content="spaced"),
-        ],
-    )
-
-
 def test_includes() -> None:
     content = """
       #include <global.h>
       #include "local.h"
+      # include "space.h"
     """
     data = parse_string(content, cleandoc=True)
-    assert data == ParsedData(includes=[Include("<global.h>"), Include('"local.h"')])
+    assert data == ParsedData(
+        includes=[Include("<global.h>"), Include('"local.h"'), Include('"space.h"')]
+    )


 def test_pragma() -> None:
@@ -63,7 +48,49 @@ def test_pragma() -> None:
     """
     data = parse_string(content, cleandoc=True)

-    assert data == ParsedData(pragmas=[Pragma(content="once")])
+    assert data == ParsedData(
+        pragmas=[Pragma(content=Value(tokens=[Token(value="once")]))]
+    )
+
+
+def test_pragma_more() -> None:
+    content = """
+      #pragma (some content here)
+      #pragma (even \
+               more \
+               content here)
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData(
+        pragmas=[
+            Pragma(
+                content=Value(
+                    tokens=[
+                        Token(value="("),
+                        Token(value="some"),
+                        Token(value="content"),
+                        Token(value="here"),
+                        Token(value=")"),
+                    ]
+                )
+            ),
+            Pragma(
+                content=Value(
+                    tokens=[
+                        Token(value="("),
+                        Token(value="even"),
+                        Token(value="more"),
+                        Token(value="content"),
+                        Token(value="here"),
+                        Token(value=")"),
+                    ]
+                )
+            ),
+        ]
+    )


 #
@@ -294,3 +321,17 @@ def test_line_continuation() -> None:
             ]
         )
     )
+
+
+#
+# #warning (C++23)
+#
+
+
+def test_warning_directive() -> None:
+    content = """
+      #warning "this is a warning"
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData()
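With this change, #define (like other unsupported directives) raises a parse error instead of being silently collected; a sketch, assuming CxxParseError is importable from cxxheaderparser.errors:

from cxxheaderparser.errors import CxxParseError
from cxxheaderparser.simple import parse_string

try:
    parse_string("#define FOO 1\n")
except CxxParseError as e:
    # "cxxheaderparser does not support #define directives, ..."
    print(e)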