Restructure preprocessor support

- Remove partial support for #define and other PP directives
- Allow pragma to span multiple lines
- Pragma now emits a list of tokens instead of a single string
- Ignore #warning directive if present
Dustin Spicuzza 2023-07-23 17:03:04 -04:00
parent b07e1f81a6
commit bd9907ad79
5 changed files with 136 additions and 61 deletions
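For downstream users, the visible change is the shape of Pragma.content: it is now a Value holding a token list rather than a raw string, and #define directives are no longer collected. A minimal sketch of the new behavior, mirroring the updated tests below:

from cxxheaderparser.simple import parse_string

data = parse_string("#pragma once\n")
# Pragma.content is now a Value of Token objects, not a plain string
print([tok.value for tok in data.pragmas[0].content.tokens])  # -> ['once']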

View File

@@ -51,6 +51,7 @@ class LexToken(Protocol):
     #: private
     lexer: lex.Lexer
+    lexmatch: "re.Match"


 PhonyEnding: LexToken = lex.LexToken()  # type: ignore
@@ -175,7 +176,10 @@ class PlyLexer:
         # Comments
         "COMMENT_SINGLELINE",
         "COMMENT_MULTILINE",
-        "PRECOMP_MACRO",
+        "LINE_DIRECTIVE",
+        "PRAGMA_DIRECTIVE",
+        "INCLUDE_DIRECTIVE",
+        "PP_DIRECTIVE",
         # misc
         "DIVIDE",
         "NEWLINE",
@@ -434,17 +438,36 @@ class PlyLexer:
             t.type = t.value
         return t

-    @TOKEN(r"\#.*")
-    def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]:
-        m = _line_re.match(t.value)
-        if m:
-            self.filename = m.group(2)
-            self.line_offset = 1 + self.lex.lineno - int(m.group(1))
-            return None
-        else:
-            return t
+    @TOKEN(r'\#[\t ]*line (\d+) "(.*)"')
+    def t_LINE_DIRECTIVE(self, t: LexToken) -> None:
+        m = t.lexmatch
+        self.filename = m.group(2)
+        self.line_offset = 1 + self.lex.lineno - int(m.group(1))
+
+    @TOKEN(r"\#[\t ]*pragma")
+    def t_PRAGMA_DIRECTIVE(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(r"\#[\t ]*include (.*)")
+    def t_INCLUDE_DIRECTIVE(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(r"\#(.*)")
+    def t_PP_DIRECTIVE(self, t: LexToken):
+        # ignore C++23 warning directive
+        if t.value.startswith("#warning"):
+            return
+        if "define" in t.value:
+            msgtype = "#define"
+        else:
+            msgtype = "preprocessor"
+        self._error(
+            "cxxheaderparser does not support "
+            + msgtype
+            + " directives, please use a C++ preprocessor first",
+            t,
+        )

     t_DIVIDE = r"/(?!/)"
     t_ELLIPSIS = r"\.\.\."
     t_DBL_LBRACKET = r"\[\["
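The ordering of these rules matters: PLY tries function-based token rules in the order they are defined, so the specific line/pragma/include rules win over the catch-all t_PP_DIRECTIVE. A standalone sketch of that PLY behavior (illustrative, not code from this repo):

import ply.lex as lex

tokens = ("PRAGMA_DIRECTIVE", "PP_DIRECTIVE", "NAME")

def t_PRAGMA_DIRECTIVE(t):
    r"\#[\t ]*pragma"
    return t

def t_PP_DIRECTIVE(t):
    r"\#(.*)"
    return t

def t_NAME(t):
    r"[A-Za-z_][A-Za-z0-9_]*"
    return t

t_ignore = " \t"

def t_newline(t):
    r"\n+"
    t.lexer.lineno += len(t.value)

def t_error(t):
    raise RuntimeError(f"illegal character {t.value[0]!r}")

lexer = lex.lex()
lexer.input('#pragma once\n#define X 1\n')
# t_PRAGMA_DIRECTIVE is defined first, so it wins over the catch-all
print([tok.type for tok in lexer])
# -> ['PRAGMA_DIRECTIVE', 'NAME', 'PP_DIRECTIVE']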
@@ -541,6 +564,12 @@ class TokenStream:
         "WHITESPACE",
     }

+    _discard_types_except_newline = {
+        "COMMENT_SINGLELINE",
+        "COMMENT_MULTILINE",
+        "WHITESPACE",
+    }
+
     def token(self) -> LexToken:
         tokbuf = self.tokbuf
         while True:
@@ -563,6 +592,17 @@ class TokenStream:
             if not self._fill_tokbuf(tokbuf):
                 return None

+    def token_newline_eof_ok(self) -> typing.Optional[LexToken]:
+        tokbuf = self.tokbuf
+        while True:
+            while tokbuf:
+                tok = tokbuf.popleft()
+                if tok.type not in self._discard_types_except_newline:
+                    return tok
+
+            if not self._fill_tokbuf(tokbuf):
+                return None
+
     def token_if(self, *types: str) -> typing.Optional[LexToken]:
         tok = self.token_eof_ok()
         if tok is None:
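The new token_newline_eof_ok differs from token_eof_ok only in that NEWLINE survives filtering, which is what lets the pragma consumer stop at the end of the line. A small end-to-end check of that property, using the public parse_string entry point (the input is hypothetical):

from cxxheaderparser.simple import parse_string

data = parse_string("#pragma once\nint x;\n")
assert len(data.pragmas) == 1
# the declaration on the next line is not swallowed by the pragma
assert len(data.namespace.variables) == 1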

View File

@@ -304,7 +304,8 @@ class CxxParser:
             "{": self._on_empty_block_start,
             "}": self._on_block_end,
             "DBL_LBRACKET": self._consume_attribute_specifier_seq,
-            "PRECOMP_MACRO": self._process_preprocessor_token,
+            "INCLUDE_DIRECTIVE": self._process_include_directive,
+            "PRAGMA_DIRECTIVE": self._process_pragma_directive,
             ";": lambda _1, _2: None,
         }
@@ -361,20 +362,29 @@ class CxxParser:
     _preprocessor_compress_re = re.compile(r"^#[\t ]+")
     _preprocessor_split_re = re.compile(r"[\t ]+")

-    def _process_preprocessor_token(
-        self, tok: LexToken, doxygen: typing.Optional[str]
-    ) -> None:
+    def _process_include_directive(self, tok: LexToken, doxygen: typing.Optional[str]):
         value = self._preprocessor_compress_re.sub("#", tok.value)
         svalue = self._preprocessor_split_re.split(value, 1)
         if len(svalue) == 2:
             self.state.location = tok.location
-            macro = svalue[0].lower().replace(" ", "")
-            if macro.startswith("#include"):
-                self.visitor.on_include(self.state, svalue[1])
-            elif macro.startswith("#define"):
-                self.visitor.on_define(self.state, svalue[1])
-            elif macro.startswith("#pragma"):
-                self.visitor.on_pragma(self.state, svalue[1])
+            self.visitor.on_include(self.state, svalue[1])
+        else:
+            raise CxxParseError("incomplete #include directive", tok)
+
+    def _process_pragma_directive(self, _: LexToken, doxygen: typing.Optional[str]):
+        # consume all tokens until the end of the line
+        # -- but if we find a paren, get the group
+        tokens: LexTokenList = []
+        while True:
+            tok = self.lex.token_newline_eof_ok()
+            if not tok or tok.type == "NEWLINE":
+                break
+            if tok.type in self._balanced_token_map:
+                tokens.extend(self._consume_balanced_tokens(tok))
+            else:
+                tokens.append(tok)
+
+        self.visitor.on_pragma(self.state, self._create_value(tokens))

     #
     # Various
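A hedged usage sketch of the new pragma handling: when the body contains a paren, the balanced group is consumed whole and flattened into the token list. The pragma text below is illustrative, not taken from this commit:

from cxxheaderparser.simple import parse_string

data = parse_string("#pragma warning(disable: 4996)\n")
# the parenthesized group is kept, parens included, as flat tokens
print([tok.value for tok in data.pragmas[0].content.tokens])
# expected: ['warning', '(', 'disable', ':', '4996', ')']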

View File

@@ -45,6 +45,7 @@ from .types import (
     UsingAlias,
     UsingDecl,
     Variable,
+    Value,
 )

 from .parserstate import (
@@ -123,14 +124,9 @@ class NamespaceScope:
 Block = typing.Union[ClassScope, NamespaceScope]


-@dataclass
-class Define:
-    content: str
-
-
 @dataclass
 class Pragma:
-    content: str
+    content: Value


 @dataclass
@@ -171,9 +167,6 @@ class ParsedData:
     #: Global namespace
     namespace: NamespaceScope = field(default_factory=lambda: NamespaceScope())

-    #: Any ``#define`` preprocessor directives encountered
-    defines: typing.List[Define] = field(default_factory=list)
-
     #: Any ``#pragma`` directives encountered
     pragmas: typing.List[Pragma] = field(default_factory=list)
@@ -208,10 +201,7 @@ class SimpleCxxVisitor:
         self.data = ParsedData(self.namespace)

-    def on_define(self, state: State, content: str) -> None:
-        self.data.defines.append(Define(content))
-
-    def on_pragma(self, state: State, content: str) -> None:
+    def on_pragma(self, state: State, content: Value) -> None:
         self.data.pragmas.append(Pragma(content))

     def on_include(self, state: State, filename: str) -> None:

View File

@@ -20,6 +20,7 @@ from .types import (
     UsingAlias,
     UsingDecl,
     Variable,
+    Value,
 )

 from .parserstate import (
@@ -36,14 +37,7 @@ class CxxVisitor(Protocol):
     Defines the interface used by the parser to emit events
     """

-    def on_define(self, state: State, content: str) -> None:
-        """
-        .. warning:: cxxheaderparser intentionally does not have a C preprocessor
-                     implementation. If you are parsing code with macros in it,
-                     use a conforming preprocessor like ``pcpp``
-        """
-
-    def on_pragma(self, state: State, content: str) -> None:
+    def on_pragma(self, state: State, content: Value) -> None:
         """
         Called once for each ``#pragma`` directive encountered
         """

View File

@@ -20,7 +20,6 @@ from cxxheaderparser.simple import (
     Pragma,
     parse_string,
     ParsedData,
-    Define,
 )

 #
@@ -28,31 +27,17 @@ from cxxheaderparser.simple import (
 #

-def test_define() -> None:
-    content = """
-      #define simple
-      #define complex(thing) stuff(thing)
-      # define spaced
-    """
-    data = parse_string(content, cleandoc=True)
-
-    assert data == ParsedData(
-        defines=[
-            Define(content="simple"),
-            Define(content="complex(thing) stuff(thing)"),
-            Define(content="spaced"),
-        ],
-    )
-
-
 def test_includes() -> None:
     content = """
       #include <global.h>
       #include "local.h"
+      # include "space.h"
     """
     data = parse_string(content, cleandoc=True)
-    assert data == ParsedData(includes=[Include("<global.h>"), Include('"local.h"')])
+    assert data == ParsedData(
+        includes=[Include("<global.h>"), Include('"local.h"'), Include('"space.h"')]
+    )


 def test_pragma() -> None:
@@ -63,7 +48,49 @@ def test_pragma() -> None:
     """
     data = parse_string(content, cleandoc=True)

-    assert data == ParsedData(pragmas=[Pragma(content="once")])
+    assert data == ParsedData(
+        pragmas=[Pragma(content=Value(tokens=[Token(value="once")]))]
+    )
+
+
+def test_pragma_more() -> None:
+    content = """
+      #pragma (some content here)
+      #pragma (even \
+               more \
+               content here)
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData(
+        pragmas=[
+            Pragma(
+                content=Value(
+                    tokens=[
+                        Token(value="("),
+                        Token(value="some"),
+                        Token(value="content"),
+                        Token(value="here"),
+                        Token(value=")"),
+                    ]
+                )
+            ),
+            Pragma(
+                content=Value(
+                    tokens=[
+                        Token(value="("),
+                        Token(value="even"),
+                        Token(value="more"),
+                        Token(value="content"),
+                        Token(value="here"),
+                        Token(value=")"),
+                    ]
+                )
+            ),
+        ]
+    )


 #
@@ -294,3 +321,17 @@ def test_line_continuation() -> None:
             ]
         )
     )
+
+
+#
+# #warning (C++23)
+#
+
+
+def test_warning_directive() -> None:
+    content = """
+      #warning "this is a warning"
+    """
+    data = parse_string(content, cleandoc=True)
+
+    assert data == ParsedData()
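With this change, #define (like other unsupported directives) raises a parse error instead of being silently collected; a sketch, assuming CxxParseError is importable from cxxheaderparser.errors:

from cxxheaderparser.errors import CxxParseError
from cxxheaderparser.simple import parse_string

try:
    parse_string("#define FOO 1\n")
except CxxParseError as e:
    # "cxxheaderparser does not support #define directives, ..."
    print(e)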