Add support for parsing user defined literals

This commit is contained in:
Dustin Spicuzza 2022-12-15 02:55:07 -05:00
parent 1eaa85ae8d
commit e5295070a0
3 changed files with 83 additions and 4 deletions

View File

@ -179,6 +179,7 @@ class PlyLexer:
# misc
"DIVIDE",
"NEWLINE",
"WHITESPACE",
"ELLIPSIS",
"DBL_LBRACKET",
"DBL_RBRACKET",
@ -329,7 +330,8 @@ class PlyLexer:
+ "[FfLl]?)"
)
t_ignore = " \t\r?@\f"
t_WHITESPACE = "[ \t]+"
t_ignore = "\r"
# The following floating and integer constants are defined as
# functions to impose a strict order (otherwise, decimal
@ -531,7 +533,12 @@ class TokenStream:
"""
raise NotImplementedError
_discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"}
_discard_types = {
"NEWLINE",
"COMMENT_SINGLELINE",
"COMMENT_MULTILINE",
"WHITESPACE",
}
def token(self) -> LexToken:
tokbuf = self.tokbuf
@ -610,6 +617,27 @@ class LexerTokenStream(TokenStream):
Provides tokens from using PlyLexer on the given input text
"""
_user_defined_literal_start = {
"FLOAT_CONST",
"HEX_FLOAT_CONST",
"INT_CONST_HEX",
"INT_CONST_BIN",
"INT_CONST_OCT",
"INT_CONST_DEC",
"INT_CONST_CHAR",
"CHAR_CONST",
"WCHAR_CONST",
"U8CHAR_CONST",
"U16CHAR_CONST",
"U32CHAR_CONST",
# String literals
"STRING_LITERAL",
"WSTRING_LITERAL",
"U8STRING_LITERAL",
"U16STRING_LITERAL",
"U32STRING_LITERAL",
}
def __init__(self, filename: typing.Optional[str], content: str) -> None:
self._lex = PlyLexer(filename)
self._lex.input(content)
@ -623,6 +651,8 @@ class LexerTokenStream(TokenStream):
if tok is None:
return False
udl_start = self._user_defined_literal_start
while True:
tok.location = self._lex.current_location()
tokbuf.append(tok)
@ -630,6 +660,19 @@ class LexerTokenStream(TokenStream):
if tok.type == "NEWLINE":
break
# detect/combine user defined literals
if tok.type in udl_start:
tok2 = get_token()
if tok2 is None:
break
if tok2.type != "NAME" or tok2.value[0] != "_":
tok = tok2
continue
tok.value = tok.value + tok2.value
tok.type = f"UD_{tok.type}"
tok = get_token()
if tok is None:
break
@ -659,6 +702,8 @@ class LexerTokenStream(TokenStream):
tok = tokbuf.popleft()
if tok.type == "NEWLINE":
comments.clear()
elif tok.type == "WHITESPACE":
pass
elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"):
comments.append(tok)
else:
@ -693,6 +738,8 @@ class LexerTokenStream(TokenStream):
tok = tokbuf.popleft()
if tok.type == "NEWLINE":
break
elif tok.type == "WHITESPACE":
new_tokbuf.append(tok)
elif tok.type in ("COMMENT_SINGLELINE", "COMMENT_MULTILINE"):
comments.append(tok)
else:

View File

@ -236,3 +236,34 @@ def test_final() -> None:
],
)
)
#
# User defined literals
#
def test_user_defined_literal() -> None:
    # A numeric literal followed by a _-prefixed suffix (e.g. 1_V) must be
    # lexed as a single user-defined-literal token and survive as one Value
    # token in the parsed variable initializer.
    src = """
    units::volt_t v = 1_V;
    """

    expected = ParsedData(
        namespace=NamespaceScope(
            variables=[
                Variable(
                    name=PQName(segments=[NameSpecifier(name="v")]),
                    type=Type(
                        typename=PQName(
                            segments=[
                                NameSpecifier(name="units"),
                                NameSpecifier(name="volt_t"),
                            ]
                        )
                    ),
                    value=Value(tokens=[Token(value="1_V")]),
                )
            ]
        )
    )

    assert parse_string(src, cleandoc=True) == expected

View File

@ -1,6 +1,6 @@
import pytest
from cxxheaderparser.lexer import PlyLexer
from cxxheaderparser.lexer import PlyLexer, LexerTokenStream
from cxxheaderparser.tokfmt import tokfmt
from cxxheaderparser.types import Token
@ -48,6 +48,7 @@ def test_tokfmt(instr: str) -> None:
if not tok:
break
if tok.type not in LexerTokenStream._discard_types:
toks.append(Token(tok.value, tok.type))
assert tokfmt(toks) == instr