Initial commit

Dustin Spicuzza
2020-12-28 03:35:30 -05:00
commit ef5c22972b
37 changed files with 14826 additions and 0 deletions

@@ -0,0 +1,4 @@
try:
from .version import __version__
except ImportError:
__version__ = "master"

@@ -0,0 +1,4 @@
from cxxheaderparser.dump import dumpmain
if __name__ == "__main__":
dumpmain()

cxxheaderparser/_ply/lex.py (Normal file, 902 lines)
@@ -0,0 +1,902 @@
# fmt: off
# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2020
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Latest version: https://github.com/dabeaz/ply
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of David Beazley or Dabeaz LLC may be used to
# endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
import re
import sys
import types
import copy
import os
import inspect
# This tuple contains acceptable string types
StringTypes = (str, bytes)
# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')
# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
def __init__(self, message, s):
self.args = (message,)
self.text = s
# Token class. This class is used to represent the tokens produced.
class LexToken(object):
def __repr__(self):
return f'LexToken({self.type},{self.value!r},{self.lineno},{self.lexpos})'
# This object is a stand-in for a logging object created by the
# logging module.
class PlyLogger(object):
def __init__(self, f):
self.f = f
def critical(self, msg, *args, **kwargs):
self.f.write((msg % args) + '\n')
def warning(self, msg, *args, **kwargs):
self.f.write('WARNING: ' + (msg % args) + '\n')
def error(self, msg, *args, **kwargs):
self.f.write('ERROR: ' + (msg % args) + '\n')
info = critical
debug = critical
# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
# input() - Store a new string in the lexer
# token() - Get the next token
# clone() - Clone the lexer
#
# lineno - Current line number
# lexpos - Current position in the input string
# -----------------------------------------------------------------------------
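# A minimal usage sketch of this runtime interface (the rules module name is
# hypothetical; see lex() further below for how a Lexer instance is built):
#
#   lexer = lex(module=my_rules_module)
#   lexer.input("int x = 3;")
#   for tok in lexer:                 # iteration repeatedly calls token()
#       print(tok.type, tok.value, tok.lineno, tok.lexpos)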
class Lexer:
def __init__(self):
self.lexre = None # Master regular expression. This is a list of
# tuples (re, findex) where re is a compiled
# regular expression and findex is a list
# mapping regex group numbers to rules
self.lexretext = None # Current regular expression strings
self.lexstatere = {} # Dictionary mapping lexer states to master regexs
self.lexstateretext = {} # Dictionary mapping lexer states to regex strings
self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names
self.lexstate = 'INITIAL' # Current lexer state
self.lexstatestack = [] # Stack of lexer states
self.lexstateinfo = None # State information
self.lexstateignore = {} # Dictionary of ignored characters for each state
self.lexstateerrorf = {} # Dictionary of error functions for each state
self.lexstateeoff = {} # Dictionary of eof functions for each state
self.lexreflags = 0 # Optional re compile flags
self.lexdata = None # Actual input data (as a string)
self.lexpos = 0 # Current position in input text
self.lexlen = 0 # Length of the input text
self.lexerrorf = None # Error rule (if any)
self.lexeoff = None # EOF rule (if any)
self.lextokens = None # List of valid tokens
self.lexignore = '' # Ignored characters
self.lexliterals = '' # Literal characters that can be passed through
self.lexmodule = None # Module
self.lineno = 1 # Current line number
def clone(self, object=None):
c = copy.copy(self)
# If the object parameter has been supplied, it means we are attaching the
# lexer to a new object. In this case, we have to rebind all methods in
# the lexstatere and lexstateerrorf tables.
if object:
newtab = {}
for key, ritem in self.lexstatere.items():
newre = []
for cre, findex in ritem:
newfindex = []
for f in findex:
if not f or not f[0]:
newfindex.append(f)
continue
newfindex.append((getattr(object, f[0].__name__), f[1]))
newre.append((cre, newfindex))
newtab[key] = newre
c.lexstatere = newtab
c.lexstateerrorf = {}
for key, ef in self.lexstateerrorf.items():
c.lexstateerrorf[key] = getattr(object, ef.__name__)
c.lexmodule = object
return c
# ------------------------------------------------------------
# input() - Push a new string into the lexer
# ------------------------------------------------------------
def input(self, s):
self.lexdata = s
self.lexpos = 0
self.lexlen = len(s)
# ------------------------------------------------------------
# begin() - Changes the lexing state
# ------------------------------------------------------------
def begin(self, state):
if state not in self.lexstatere:
raise ValueError(f'Undefined state {state!r}')
self.lexre = self.lexstatere[state]
self.lexretext = self.lexstateretext[state]
self.lexignore = self.lexstateignore.get(state, '')
self.lexerrorf = self.lexstateerrorf.get(state, None)
self.lexeoff = self.lexstateeoff.get(state, None)
self.lexstate = state
# ------------------------------------------------------------
# push_state() - Changes the lexing state and saves old on stack
# ------------------------------------------------------------
def push_state(self, state):
self.lexstatestack.append(self.lexstate)
self.begin(state)
# ------------------------------------------------------------
# pop_state() - Restores the previous state
# ------------------------------------------------------------
def pop_state(self):
self.begin(self.lexstatestack.pop())
# ------------------------------------------------------------
# current_state() - Returns the current lexing state
# ------------------------------------------------------------
def current_state(self):
return self.lexstate
# ------------------------------------------------------------
# skip() - Skip ahead n characters
# ------------------------------------------------------------
def skip(self, n):
self.lexpos += n
# ------------------------------------------------------------
# token() - Return the next token from the Lexer
#
# Note: This function has been carefully implemented to be as fast
# as possible. Don't make changes unless you really know what
# you are doing
# ------------------------------------------------------------
def token(self):
# Make local copies of frequently referenced attributes
lexpos = self.lexpos
lexlen = self.lexlen
lexignore = self.lexignore
lexdata = self.lexdata
while lexpos < lexlen:
# This code provides some short-circuit code for whitespace, tabs, and other ignored characters
if lexdata[lexpos] in lexignore:
lexpos += 1
continue
# Look for a regular expression match
for lexre, lexindexfunc in self.lexre:
m = lexre.match(lexdata, lexpos)
if not m:
continue
# Create a token for return
tok = LexToken()
tok.value = m.group()
tok.lineno = self.lineno
tok.lexpos = lexpos
i = m.lastindex
func, tok.type = lexindexfunc[i]
if not func:
# If no token type was set, it's an ignored token
if tok.type:
self.lexpos = m.end()
return tok
else:
lexpos = m.end()
break
lexpos = m.end()
# If token is processed by a function, call it
tok.lexer = self # Set additional attributes useful in token rules
self.lexmatch = m
self.lexpos = lexpos
newtok = func(tok)
del tok.lexer
del self.lexmatch
# Every function must return a token; if it returns nothing, we just move on to the next token
if not newtok:
lexpos = self.lexpos # This is here in case user has updated lexpos.
lexignore = self.lexignore # This is here in case there was a state change
break
return newtok
else:
# No match, see if in literals
if lexdata[lexpos] in self.lexliterals:
tok = LexToken()
tok.value = lexdata[lexpos]
tok.lineno = self.lineno
tok.type = tok.value
tok.lexpos = lexpos
self.lexpos = lexpos + 1
return tok
# No match. Call t_error() if defined.
if self.lexerrorf:
tok = LexToken()
tok.value = self.lexdata[lexpos:]
tok.lineno = self.lineno
tok.type = 'error'
tok.lexer = self
tok.lexpos = lexpos
self.lexpos = lexpos
newtok = self.lexerrorf(tok)
if lexpos == self.lexpos:
# Error method didn't change text position at all. This is an error.
raise LexError(f"Scanning error. Illegal character {lexdata[lexpos]!r}",
lexdata[lexpos:])
lexpos = self.lexpos
if not newtok:
continue
return newtok
self.lexpos = lexpos
raise LexError(f"Illegal character {lexdata[lexpos]!r} at index {lexpos}",
lexdata[lexpos:])
if self.lexeoff:
tok = LexToken()
tok.type = 'eof'
tok.value = ''
tok.lineno = self.lineno
tok.lexpos = lexpos
tok.lexer = self
self.lexpos = lexpos
newtok = self.lexeoff(tok)
return newtok
self.lexpos = lexpos + 1
if self.lexdata is None:
raise RuntimeError('No input string given with input()')
return None
# Iterator interface
def __iter__(self):
return self
def __next__(self):
t = self.token()
if t is None:
raise StopIteration
return t
# -----------------------------------------------------------------------------
# === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# _get_regex(func)
#
# Returns the regular expression assigned to a function either as a doc string
# or as a .regex attribute attached by the @TOKEN decorator.
# -----------------------------------------------------------------------------
def _get_regex(func):
return getattr(func, 'regex', func.__doc__)
# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack. This is used to get the environment
# associated with the lex() call if none was provided.
# -----------------------------------------------------------------------------
def get_caller_module_dict(levels):
f = sys._getframe(levels)
return { **f.f_globals, **f.f_locals }
# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------
def _form_master_re(relist, reflags, ldict, toknames):
if not relist:
return [], [], []
regex = '|'.join(relist)
try:
lexre = re.compile(regex, reflags)
# Build the index to function map for the matching engine
lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
lexindexnames = lexindexfunc[:]
for f, i in lexre.groupindex.items():
handle = ldict.get(f, None)
if type(handle) in (types.FunctionType, types.MethodType):
lexindexfunc[i] = (handle, toknames[f])
lexindexnames[i] = f
elif handle is not None:
lexindexnames[i] = f
if f.find('ignore_') > 0:
lexindexfunc[i] = (None, None)
else:
lexindexfunc[i] = (None, toknames[f])
return [(lexre, lexindexfunc)], [regex], [lexindexnames]
except Exception:
m = (len(relist) // 2) + 1
llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
return (llist+rlist), (lre+rre), (lnames+rnames)
# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------
def _statetoken(s, names):
parts = s.split('_')
for i, part in enumerate(parts[1:], 1):
if part not in names and part != 'ANY':
break
if i > 1:
states = tuple(parts[1:i])
else:
states = ('INITIAL',)
if 'ANY' in states:
states = tuple(names)
tokenname = '_'.join(parts[i:])
return (states, tokenname)
# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
def __init__(self, ldict, log=None, reflags=0):
self.ldict = ldict
self.error_func = None
self.tokens = []
self.reflags = reflags
self.stateinfo = {'INITIAL': 'inclusive'}
self.modules = set()
self.error = False
self.log = PlyLogger(sys.stderr) if log is None else log
# Get all of the basic information
def get_all(self):
self.get_tokens()
self.get_literals()
self.get_states()
self.get_rules()
# Validate all of the information
def validate_all(self):
self.validate_tokens()
self.validate_literals()
self.validate_rules()
return self.error
# Get the tokens map
def get_tokens(self):
tokens = self.ldict.get('tokens', None)
if not tokens:
self.log.error('No token list is defined')
self.error = True
return
if not isinstance(tokens, (list, tuple)):
self.log.error('tokens must be a list or tuple')
self.error = True
return
if not tokens:
self.log.error('tokens is empty')
self.error = True
return
self.tokens = tokens
# Validate the tokens
def validate_tokens(self):
terminals = {}
for n in self.tokens:
if not _is_identifier.match(n):
self.log.error(f"Bad token name {n!r}")
self.error = True
if n in terminals:
self.log.warning(f"Token {n!r} multiply defined")
terminals[n] = 1
# Get the literals specifier
def get_literals(self):
self.literals = self.ldict.get('literals', '')
if not self.literals:
self.literals = ''
# Validate literals
def validate_literals(self):
try:
for c in self.literals:
if not isinstance(c, StringTypes) or len(c) > 1:
self.log.error(f'Invalid literal {c!r}. Must be a single character')
self.error = True
except TypeError:
self.log.error('Invalid literals specification. literals must be a sequence of characters')
self.error = True
def get_states(self):
self.states = self.ldict.get('states', None)
# Build statemap
if self.states:
if not isinstance(self.states, (tuple, list)):
self.log.error('states must be defined as a tuple or list')
self.error = True
else:
for s in self.states:
if not isinstance(s, tuple) or len(s) != 2:
self.log.error("Invalid state specifier %r. Must be a tuple (statename,'exclusive|inclusive')", s)
self.error = True
continue
name, statetype = s
if not isinstance(name, StringTypes):
self.log.error('State name %r must be a string', name)
self.error = True
continue
if not (statetype == 'inclusive' or statetype == 'exclusive'):
self.log.error("State type for state %r must be 'inclusive' or 'exclusive'", name)
self.error = True
continue
if name in self.stateinfo:
self.log.error("State %r already defined", name)
self.error = True
continue
self.stateinfo[name] = statetype
# Get all of the symbols with a t_ prefix and sort them into various
# categories (functions, strings, error functions, and ignore characters)
def get_rules(self):
tsymbols = [f for f in self.ldict if f[:2] == 't_']
# Now build up a list of functions and a list of strings
self.toknames = {} # Mapping of symbols to token names
self.funcsym = {} # Symbols defined as functions
self.strsym = {} # Symbols defined as strings
self.ignore = {} # Ignore strings by state
self.errorf = {} # Error functions by state
self.eoff = {} # EOF functions by state
for s in self.stateinfo:
self.funcsym[s] = []
self.strsym[s] = []
if len(tsymbols) == 0:
self.log.error('No rules of the form t_rulename are defined')
self.error = True
return
for f in tsymbols:
t = self.ldict[f]
states, tokname = _statetoken(f, self.stateinfo)
self.toknames[f] = tokname
if hasattr(t, '__call__'):
if tokname == 'error':
for s in states:
self.errorf[s] = t
elif tokname == 'eof':
for s in states:
self.eoff[s] = t
elif tokname == 'ignore':
line = t.__code__.co_firstlineno
file = t.__code__.co_filename
self.log.error("%s:%d: Rule %r must be defined as a string", file, line, t.__name__)
self.error = True
else:
for s in states:
self.funcsym[s].append((f, t))
elif isinstance(t, StringTypes):
if tokname == 'ignore':
for s in states:
self.ignore[s] = t
if '\\' in t:
self.log.warning("%s contains a literal backslash '\\'", f)
elif tokname == 'error':
self.log.error("Rule %r must be defined as a function", f)
self.error = True
else:
for s in states:
self.strsym[s].append((f, t))
else:
self.log.error('%s not defined as a function or string', f)
self.error = True
# Sort the functions by line number
for f in self.funcsym.values():
f.sort(key=lambda x: x[1].__code__.co_firstlineno)
# Sort the strings by regular expression length
for s in self.strsym.values():
s.sort(key=lambda x: len(x[1]), reverse=True)
# Validate all of the t_rules collected
def validate_rules(self):
for state in self.stateinfo:
# Validate all rules defined by functions
for fname, f in self.funcsym[state]:
line = f.__code__.co_firstlineno
file = f.__code__.co_filename
module = inspect.getmodule(f)
self.modules.add(module)
tokname = self.toknames[fname]
if isinstance(f, types.MethodType):
reqargs = 2
else:
reqargs = 1
nargs = f.__code__.co_argcount
if nargs > reqargs:
self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__)
self.error = True
continue
if nargs < reqargs:
self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__)
self.error = True
continue
if not _get_regex(f):
self.log.error("%s:%d: No regular expression defined for rule %r", file, line, f.__name__)
self.error = True
continue
try:
c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), self.reflags)
if c.match(''):
self.log.error("%s:%d: Regular expression for rule %r matches empty string", file, line, f.__name__)
self.error = True
except re.error as e:
self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
if '#' in _get_regex(f):
self.log.error("%s:%d. Make sure '#' in rule %r is escaped with '\\#'", file, line, f.__name__)
self.error = True
# Validate all rules defined by strings
for name, r in self.strsym[state]:
tokname = self.toknames[name]
if tokname == 'error':
self.log.error("Rule %r must be defined as a function", name)
self.error = True
continue
if tokname not in self.tokens and tokname.find('ignore_') < 0:
self.log.error("Rule %r defined for an unspecified token %s", name, tokname)
self.error = True
continue
try:
c = re.compile('(?P<%s>%s)' % (name, r), self.reflags)
if (c.match('')):
self.log.error("Regular expression for rule %r matches empty string", name)
self.error = True
except re.error as e:
self.log.error("Invalid regular expression for rule %r. %s", name, e)
if '#' in r:
self.log.error("Make sure '#' in rule %r is escaped with '\\#'", name)
self.error = True
if not self.funcsym[state] and not self.strsym[state]:
self.log.error("No rules defined for state %r", state)
self.error = True
# Validate the error function
efunc = self.errorf.get(state, None)
if efunc:
f = efunc
line = f.__code__.co_firstlineno
file = f.__code__.co_filename
module = inspect.getmodule(f)
self.modules.add(module)
if isinstance(f, types.MethodType):
reqargs = 2
else:
reqargs = 1
nargs = f.__code__.co_argcount
if nargs > reqargs:
self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__)
self.error = True
if nargs < reqargs:
self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__)
self.error = True
for module in self.modules:
self.validate_module(module)
# -----------------------------------------------------------------------------
# validate_module()
#
# This checks to see if there are duplicated t_rulename() functions or strings
# in the parser input file. This is done using a simple regular expression
# match on each line in the source code of the given module.
# -----------------------------------------------------------------------------
def validate_module(self, module):
try:
lines, linen = inspect.getsourcelines(module)
except IOError:
return
fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
counthash = {}
linen += 1
for line in lines:
m = fre.match(line)
if not m:
m = sre.match(line)
if m:
name = m.group(1)
prev = counthash.get(name)
if not prev:
counthash[name] = linen
else:
filename = inspect.getsourcefile(module)
self.log.error('%s:%d: Rule %s redefined. Previously defined on line %d', filename, linen, name, prev)
self.error = True
linen += 1
# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(*, module=None, object=None, debug=False,
reflags=int(re.VERBOSE), debuglog=None, errorlog=None):
global lexer
ldict = None
stateinfo = {'INITIAL': 'inclusive'}
lexobj = Lexer()
global token, input
if errorlog is None:
errorlog = PlyLogger(sys.stderr)
if debug:
if debuglog is None:
debuglog = PlyLogger(sys.stderr)
# Get the module dictionary used for the lexer
if object:
module = object
# Get the module dictionary used for the parser
if module:
_items = [(k, getattr(module, k)) for k in dir(module)]
ldict = dict(_items)
# If no __file__ attribute is available, try to obtain it from the __module__ instead
if '__file__' not in ldict:
ldict['__file__'] = sys.modules[ldict['__module__']].__file__
else:
ldict = get_caller_module_dict(2)
# Collect parser information from the dictionary
linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
linfo.get_all()
if linfo.validate_all():
raise SyntaxError("Can't build lexer")
# Dump some basic debugging information
if debug:
debuglog.info('lex: tokens = %r', linfo.tokens)
debuglog.info('lex: literals = %r', linfo.literals)
debuglog.info('lex: states = %r', linfo.stateinfo)
# Build a dictionary of valid token names
lexobj.lextokens = set()
for n in linfo.tokens:
lexobj.lextokens.add(n)
# Get literals specification
if isinstance(linfo.literals, (list, tuple)):
lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
else:
lexobj.lexliterals = linfo.literals
lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)
# Get the stateinfo dictionary
stateinfo = linfo.stateinfo
regexs = {}
# Build the master regular expressions
for state in stateinfo:
regex_list = []
# Add rules defined by functions first
for fname, f in linfo.funcsym[state]:
regex_list.append('(?P<%s>%s)' % (fname, _get_regex(f)))
if debug:
debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state)
# Now add all of the simple rules
for name, r in linfo.strsym[state]:
regex_list.append('(?P<%s>%s)' % (name, r))
if debug:
debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)
regexs[state] = regex_list
# Build the master regular expressions
if debug:
debuglog.info('lex: ==== MASTER REGEXS FOLLOW ====')
for state in regexs:
lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
lexobj.lexstatere[state] = lexre
lexobj.lexstateretext[state] = re_text
lexobj.lexstaterenames[state] = re_names
if debug:
for i, text in enumerate(re_text):
debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)
# For inclusive states, we need to add the regular expressions from the INITIAL state
for state, stype in stateinfo.items():
if state != 'INITIAL' and stype == 'inclusive':
lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])
lexobj.lexstateinfo = stateinfo
lexobj.lexre = lexobj.lexstatere['INITIAL']
lexobj.lexretext = lexobj.lexstateretext['INITIAL']
lexobj.lexreflags = reflags
# Set up ignore variables
lexobj.lexstateignore = linfo.ignore
lexobj.lexignore = lexobj.lexstateignore.get('INITIAL', '')
# Set up error functions
lexobj.lexstateerrorf = linfo.errorf
lexobj.lexerrorf = linfo.errorf.get('INITIAL', None)
if not lexobj.lexerrorf:
errorlog.warning('No t_error rule is defined')
# Set up eof functions
lexobj.lexstateeoff = linfo.eoff
lexobj.lexeoff = linfo.eoff.get('INITIAL', None)
# Check state information for ignore and error rules
for s, stype in stateinfo.items():
if stype == 'exclusive':
if s not in linfo.errorf:
errorlog.warning("No error rule is defined for exclusive state %r", s)
if s not in linfo.ignore and lexobj.lexignore:
errorlog.warning("No ignore rule is defined for exclusive state %r", s)
elif stype == 'inclusive':
if s not in linfo.errorf:
linfo.errorf[s] = linfo.errorf.get('INITIAL', None)
if s not in linfo.ignore:
linfo.ignore[s] = linfo.ignore.get('INITIAL', '')
# Create global versions of the token() and input() functions
token = lexobj.token
input = lexobj.input
lexer = lexobj
return lexobj
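# A sketch of the module contents that lex() reflects over (the rules below are
# a hypothetical example following the tokens/t_ conventions validated above;
# cxxheaderparser itself passes a Lexer instance via lex.lex(module=inst)):
#
#   tokens = ("NUMBER", "PLUS")
#   t_ignore = " \t"
#   t_PLUS = r"\+"
#   def t_NUMBER(t):
#       r"\d+"
#       return t
#   def t_error(t):
#       t.lexer.skip(1)
#   lexer = lex(module=sys.modules[__name__])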
# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------
def runmain(lexer=None, data=None):
if not data:
try:
filename = sys.argv[1]
with open(filename) as f:
data = f.read()
except IndexError:
sys.stdout.write('Reading from standard input (type EOF to end):\n')
data = sys.stdin.read()
if lexer:
_input = lexer.input
else:
_input = input
_input(data)
if lexer:
_token = lexer.token
else:
_token = token
while True:
tok = _token()
if not tok:
break
sys.stdout.write(f'({tok.type},{tok.value!r},{tok.lineno},{tok.lexpos})\n')
# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regular expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------
def TOKEN(r):
def set_regex(f):
if hasattr(r, '__call__'):
f.regex = _get_regex(r)
else:
f.regex = r
return f
return set_regex
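# A brief usage sketch for @TOKEN (the identifier rule below is a hypothetical
# example, not part of this commit):
#
#   identifier = r"[A-Za-z_][A-Za-z0-9_]*"
#
#   @TOKEN(identifier)
#   def t_ID(t):
#       return t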

cxxheaderparser/dump.py (Normal file, 53 lines)
@@ -0,0 +1,53 @@
import argparse
import dataclasses
import json
import pprint
import subprocess
import sys
from .options import ParserOptions
from .simple import parse_file
def dumpmain():
parser = argparse.ArgumentParser()
parser.add_argument("header")
parser.add_argument(
"-w",
"--width",
default=80,
type=int,
help="Width of output when in pprint mode",
)
parser.add_argument("-v", "--verbose", default=False, action="store_true")
parser.add_argument(
"--mode", choices=["json", "pprint", "repr", "brepr"], default="pprint"
)
args = parser.parse_args()
options = ParserOptions(verbose=args.verbose)
data = parse_file(args.header, options=options)
if args.mode == "pprint":
ddata = dataclasses.asdict(data)
pprint.pprint(ddata, width=args.width, compact=True)
elif args.mode == "json":
ddata = dataclasses.asdict(data)
json.dump(ddata, sys.stdout, indent=2)
elif args.mode == "brepr":
stmt = repr(data)
stmt = subprocess.check_output(
["black", "-", "-q"], input=stmt.encode("utf-8")
).decode("utf-8")
print(stmt)
elif args.mode == "repr":
print(data)
else:
parser.error("Invalid mode")
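# A usage sketch: dumpmain() is wired up as the package entry point (see the
# __main__ module above), so a hypothetical invocation looks like:
#
#   python -m cxxheaderparser some_header.h --mode json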

cxxheaderparser/errors.py (Normal file, 12 lines)
@@ -0,0 +1,12 @@
import typing
from .lexer import LexToken
class CxxParseError(Exception):
"""
Exception raised when a parsing error occurs
"""
def __init__(self, msg: str, tok: typing.Optional[LexToken] = None) -> None:
Exception.__init__(self, msg)
self.tok = tok

@@ -0,0 +1,98 @@
import argparse
import dataclasses
import inspect
import subprocess
from .options import ParserOptions
from .simple import parse_string
def nondefault_repr(data):
"""
Similar to the default dataclass repr, but excludes any
default parameters or parameters with compare=False
"""
is_dataclass = dataclasses.is_dataclass
get_fields = dataclasses.fields
MISSING = dataclasses.MISSING
def _inner_repr(o) -> str:
if is_dataclass(o):
vals = []
for f in get_fields(o):
if f.repr and f.compare:
v = getattr(o, f.name)
if f.default_factory is not MISSING:
default = f.default_factory()
else:
default = f.default
if v != default:
vals.append(f"{f.name}={_inner_repr(v)}")
return f"{o.__class__.__qualname__ }({', '.join(vals)})"
elif isinstance(o, list):
return f"[{','.join(_inner_repr(l) for l in o)}]"
elif isinstance(o, dict):
vals = []
for k, v in o.items():
vals.append(f'"{k}": {_inner_repr(v)}')
return "{" + ",".join(vals) + "}"
else:
return repr(o)
return _inner_repr(data)
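# A small sketch of the intended behaviour (Point is a hypothetical dataclass):
#
#   @dataclasses.dataclass
#   class Point:
#       x: int = 0
#       y: int = 0
#
#   nondefault_repr(Point(x=3))   # -> "Point(x=3)"; the defaulted y is omitted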
def gentest(infile: str, name: str, outfile: str, verbose: bool):
# Goal is to allow making a unit test as easy as running this dumper
# on a file and copy/pasting this into a test
with open(infile, "r") as fp:
content = fp.read()
options = ParserOptions(verbose=verbose)
data = parse_string(content, options=options)
stmt = nondefault_repr(data)
content = content.replace("\n", "\n ")
stmt = inspect.cleandoc(
f'''
def test_{name}():
content = """
{content}
"""
data = parse_string(content, cleandoc=True)
assert data == {stmt}
'''
)
# format it with black
stmt = subprocess.check_output(
["black", "-", "-q"], input=stmt.encode("utf-8")
).decode("utf-8")
if outfile == "-":
print(stmt)
else:
with open(outfile, "w") as fp:
fp.write(stmt)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("header")
parser.add_argument("name", nargs="?", default="TODO")
parser.add_argument("-v", "--verbose", default=False, action="store_true")
parser.add_argument("-o", "--output", default="-")
args = parser.parse_args()
gentest(args.header, args.name, args.output, args.verbose)

cxxheaderparser/lexer.py (Normal file, 425 lines)
@@ -0,0 +1,425 @@
from collections import deque
import re
import typing
import sys
from ._ply import lex
if sys.version_info >= (3, 8):
Protocol = typing.Protocol
else:
Protocol = object
_line_re = re.compile(r'^#line (\d+) "(.*)"')
_multicomment_re = re.compile("\n[\\s]+\\*")
class Location(typing.NamedTuple):
"""
Location that something was found at, takes #line directives into account
"""
filename: str
lineno: int
class LexToken(Protocol):
"""
Token as emitted by PLY and modified by our lexer
"""
#: Lexer type for this token
type: str
#: Raw value for this token
value: str
lineno: int
lexpos: int
#: Location token was found at
location: Location
class Lexer:
keywords = {
"__attribute__",
"alignas",
"alignof",
"asm",
"auto",
"bool",
"break",
"case",
"catch",
"char",
"char8_t",
"char16_t",
"char32_t",
"class",
"const",
"constexpr",
"const_cast",
"continue",
"decltype",
"__declspec",
"default",
"delete",
"do",
"double",
"dynamic_cast",
"else",
"enum",
"explicit",
"export",
"extern",
"false",
"final",
"float",
"for",
"friend",
"goto",
"if",
"inline",
"int",
"long",
"mutable",
"namespace",
"new",
"noexcept",
"nullptr",
"nullptr_t", # not a keyword, but makes things easier
"operator",
"private",
"protected",
"public",
"register",
"reinterpret_cast",
"return",
"short",
"signed",
"sizeof",
"static",
"static_assert",
"static_cast",
"struct",
"switch",
"template",
"this",
"thread_local",
"throw",
"true",
"try",
"typedef",
"typeid",
"typename",
"union",
"unsigned",
"using",
"virtual",
"void",
"volatile",
"wchar_t",
"while",
}
tokens = [
"NUMBER",
"FLOAT_NUMBER",
"NAME",
"COMMENT_SINGLELINE",
"COMMENT_MULTILINE",
"PRECOMP_MACRO",
"DIVIDE",
"CHAR_LITERAL",
"STRING_LITERAL",
"NEWLINE",
"ELLIPSIS",
"DBL_LBRACKET",
"DBL_RBRACKET",
"DBL_COLON",
"DBL_AMP",
"SHIFT_LEFT",
] + list(keywords)
literals = [
"<",
">",
"(",
")",
"{",
"}",
"[",
"]",
";",
":",
",",
"\\",
"|",
"%",
"^",
"!",
"*",
"-",
"+",
"&",
"=",
"'",
".",
]
t_ignore = " \t\r?@\f"
t_NUMBER = r"[0-9][0-9XxA-Fa-f]*"
t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?"
def t_NAME(self, t):
r"[A-Za-z_~][A-Za-z0-9_]*"
if t.value in self.keywords:
t.type = t.value
return t
def t_PRECOMP_MACRO(self, t):
r"\#.*"
m = _line_re.match(t.value)
if m:
filename = m.group(2)
if filename not in self._filenames_set:
self.filenames.append(filename)
self._filenames_set.add(filename)
self.filename = filename
self.line_offset = 1 + self.lex.lineno - int(m.group(1))
else:
return t
def t_COMMENT_SINGLELINE(self, t):
r"\/\/.*\n?"
if t.value.startswith("///") or t.value.startswith("//!"):
self.comments.append(t.value.lstrip("\t ").rstrip("\n"))
t.lexer.lineno += t.value.count("\n")
return t
t_DIVIDE = r"/(?!/)"
t_CHAR_LITERAL = "'.'"
t_ELLIPSIS = r"\.\.\."
t_DBL_LBRACKET = r"\[\["
t_DBL_RBRACKET = r"\]\]"
t_DBL_COLON = r"::"
t_DBL_AMP = r"&&"
t_SHIFT_LEFT = r"<<"
# SHIFT_RIGHT introduces ambiguity
# found at http://wordaligned.org/articles/string-literals-and-regular-expressions
# TODO: This does not work with the string "bla \" bla"
t_STRING_LITERAL = r'"([^"\\]|\\.)*"'
# Found at http://ostermiller.org/findcomment.html
def t_COMMENT_MULTILINE(self, t):
r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?"
if t.value.startswith("/**") or t.value.startswith("/*!"):
# not sure why, but get double new lines
v = t.value.replace("\n\n", "\n")
# strip prefixing whitespace
v = _multicomment_re.sub("\n*", v)
self.comments = v.splitlines()
t.lexer.lineno += t.value.count("\n")
return t
def t_NEWLINE(self, t):
r"\n+"
t.lexer.lineno += len(t.value)
del self.comments[:]
return t
def t_error(self, v):
print("Lex error: ", v)
_lexer = None
def __new__(cls, *args, **kwargs):
# only build the lexer once
inst = super().__new__(cls)
if cls._lexer is None:
cls._lexer = lex.lex(module=inst)
inst.lex = cls._lexer.clone(inst)
return inst
def __init__(self, filename: typing.Optional[str] = None):
self.input = self.lex.input
# For tracking current file/line position
self.filename = filename
self.line_offset = 0
self.filenames = []
self._filenames_set = set()
if self.filename:
self.filenames.append(filename)
self._filenames_set.add(filename)
# Doxygen comments
self.comments = []
self.lookahead = typing.Deque[LexToken]()
def current_location(self) -> Location:
if self.lookahead:
return self.lookahead[0].location
return Location(self.filename, self.lex.lineno - self.line_offset)
def get_doxygen(self) -> typing.Optional[str]:
"""
This should be called after the first element of something has
been consumed.
It will look ahead for comments that come after the item, if prior
comments don't exist.
"""
# Assumption: This function is either called at the beginning of a
# statement or at the end of a statement
if self.comments:
comments = self.comments
else:
comments = []
# only look for comments until a newline (including lookahead)
for tok in self.lookahead:
if tok.type == "NEWLINE":
return None
while True:
tok = self.lex.token()
comments.extend(self.comments)
if tok is None:
break
tok.location = Location(self.filename, tok.lineno - self.line_offset)
ttype = tok.type
if ttype == "NEWLINE":
self.lookahead.append(tok)
break
if ttype not in self._discard_types:
self.lookahead.append(tok)
if ttype == "NAME":
break
del self.comments[:]
comments = "\n".join(comments)
del self.comments[:]
if comments:
return comments
return None
_discard_types = {"NEWLINE", "COMMENT_SINGLELINE", "COMMENT_MULTILINE"}
def token(self) -> LexToken:
tok = None
while self.lookahead:
tok = self.lookahead.popleft()
if tok.type not in self._discard_types:
return tok
while True:
tok = self.lex.token()
if tok is None:
raise EOFError("unexpected end of file")
if tok.type not in self._discard_types:
tok.location = Location(self.filename, tok.lineno - self.line_offset)
break
return tok
def token_eof_ok(self) -> typing.Optional[LexToken]:
tok = None
while self.lookahead:
tok = self.lookahead.popleft()
if tok.type not in self._discard_types:
return tok
while True:
tok = self.lex.token()
if tok is None:
break
if tok.type not in self._discard_types:
tok.location = Location(self.filename, tok.lineno - self.line_offset)
break
return tok
def token_if(self, *types: str) -> typing.Optional[LexToken]:
tok = self.token_eof_ok()
if tok is None:
return None
if tok.type not in types:
# put it back on the left in case it was retrieved
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None
return tok
def token_if_in_set(self, types: typing.Set[str]) -> typing.Optional[LexToken]:
tok = self.token_eof_ok()
if tok is None:
return None
if tok.type not in types:
# put it back on the left in case it was retrieved
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None
return tok
def token_if_val(self, *vals: str) -> typing.Optional[LexToken]:
tok = self.token_eof_ok()
if tok is None:
return None
if tok.value not in vals:
# put it back on the left in case it was retrieved
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None
return tok
def token_if_not(self, *types: str) -> typing.Optional[LexToken]:
tok = self.token_eof_ok()
if tok is None:
return None
if tok.type in types:
# put it back on the left in case it was retrieved
# from the lookahead buffer
self.lookahead.appendleft(tok)
return None
return tok
def token_peek_if(self, *types: str) -> bool:
tok = self.token_eof_ok()
if not tok:
return False
self.lookahead.appendleft(tok)
return tok.type in types
def return_token(self, tok: LexToken) -> None:
self.lookahead.appendleft(tok)
def return_tokens(self, toks: typing.Iterable[LexToken]) -> None:
self.lookahead.extendleft(reversed(toks))
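# A small usage sketch of this Lexer (filename and input text are arbitrary):
#
#   lexer = Lexer("example.h")
#   lexer.input("int x;")
#   tok = lexer.token()     # first token: the "int" keyword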
if __name__ == "__main__":
try:
lex.runmain(lexer=Lexer(None))
except EOFError:
pass

@@ -0,0 +1,14 @@
from dataclasses import dataclass
@dataclass
class ParserOptions:
"""
Options that control parsing behaviors
"""
#: If true, prints out verbose parsing information
verbose: bool = False
#: If true, converts a single void parameter to zero parameters
convert_void_to_zero_params: bool = True
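# A usage sketch (the header path is hypothetical; cxxheaderparser.dump and
# cxxheaderparser.simple show the real call sites):
#
#   options = ParserOptions(verbose=True)
#   data = parse_file("my_header.h", options=options)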

cxxheaderparser/parser.py (Normal file, 2082 lines)

File diff suppressed because it is too large

@@ -0,0 +1,114 @@
import typing
if typing.TYPE_CHECKING:
from .visitor import CxxVisitor
from .errors import CxxParseError
from .lexer import LexToken, Location
from .types import ClassDecl, NamespaceDecl
class ParsedTypeModifiers(typing.NamedTuple):
vars: typing.Dict[str, LexToken] # only found on variables
both: typing.Dict[str, LexToken] # found on either variables or functions
meths: typing.Dict[str, LexToken] # only found on methods
def validate(self, *, var_ok: bool, meth_ok: bool, msg: str) -> None:
# Almost there! Do any checks the caller asked for
if not var_ok and self.vars:
for tok in self.vars.values():
raise CxxParseError(f"{msg}: unexpected '{tok.value}'")
if not meth_ok and self.meths:
for tok in self.meths.values():
raise CxxParseError(f"{msg}: unexpected '{tok.value}'")
if not meth_ok and not var_ok and self.both:
for tok in self.both.values():
raise CxxParseError(f"{msg}: unexpected '{tok.value}'")
class State:
#: parent state
parent: typing.Optional["State"]
def __init__(self, parent: typing.Optional["State"]) -> None:
self.parent = parent
def _finish(self, visitor: "CxxVisitor") -> None:
pass
class BlockState(State):
#: Approximate location that the parsed element was found at
location: Location
class EmptyBlockState(BlockState):
def _finish(self, visitor: "CxxVisitor") -> None:
visitor.on_empty_block_end(self)
class ExternBlockState(BlockState):
#: The linkage for this extern block
linkage: str
def __init__(self, parent: typing.Optional[State], linkage: str) -> None:
super().__init__(parent)
self.linkage = linkage
def _finish(self, visitor: "CxxVisitor"):
visitor.on_extern_block_end(self)
class NamespaceBlockState(BlockState):
#: The incremental namespace for this block
namespace: NamespaceDecl
def __init__(
self, parent: typing.Optional[State], namespace: NamespaceDecl
) -> None:
super().__init__(parent)
self.namespace = namespace
def _finish(self, visitor: "CxxVisitor") -> None:
visitor.on_namespace_end(self)
class ClassBlockState(BlockState):
#: class decl block being processed
class_decl: ClassDecl
#: Current access level for items encountered
access: str
#: Currently parsing as a typedef
typedef: bool
#: modifiers to apply to following variables
mods: ParsedTypeModifiers
def __init__(
self,
parent: typing.Optional[State],
class_decl: ClassDecl,
access: str,
typedef: bool,
mods: ParsedTypeModifiers,
) -> None:
super().__init__(parent)
self.class_decl = class_decl
self.access = access
self.typedef = typedef
self.mods = mods
def _set_access(self, access: str) -> None:
self.access = access
def _finish(self, visitor: "CxxVisitor") -> None:
visitor.on_class_end(self)

cxxheaderparser/simple.py (Normal file, 294 lines)
@@ -0,0 +1,294 @@
"""
The simple parser/collector iterates over the C++ file and returns a data
structure with all elements in it. Not quite as flexible as implementing
your own parser listener, but you can accomplish most things with it.
cxxheaderparser's unit tests predominantly use the simple API for parsing,
so you can expect it to be pretty stable.
"""
import inspect
import typing
from dataclasses import dataclass, field
from .types import (
ClassDecl,
EnumDecl,
Field,
ForwardDecl,
FriendDecl,
Function,
Method,
Typedef,
UsingAlias,
UsingDecl,
Variable,
)
from .parserstate import (
State,
EmptyBlockState,
ClassBlockState,
ExternBlockState,
NamespaceBlockState,
)
from .parser import CxxParser
from .options import ParserOptions
#
# Data structure
#
@dataclass
class ClassScope:
class_decl: ClassDecl
#: Nested classes
classes: typing.List["ClassScope"] = field(default_factory=list)
enums: typing.List[EnumDecl] = field(default_factory=list)
fields: typing.List[Field] = field(default_factory=list)
friends: typing.List[FriendDecl] = field(default_factory=list)
methods: typing.List[Method] = field(default_factory=list)
typedefs: typing.List[Typedef] = field(default_factory=list)
forward_decls: typing.List[ForwardDecl] = field(default_factory=list)
using: typing.List[UsingDecl] = field(default_factory=list)
using_alias: typing.List[UsingAlias] = field(default_factory=list)
@dataclass
class NamespaceScope:
name: str = ""
classes: typing.List["ClassScope"] = field(default_factory=list)
enums: typing.List[EnumDecl] = field(default_factory=list)
functions: typing.List[Method] = field(default_factory=list)
typedefs: typing.List[Typedef] = field(default_factory=list)
variables: typing.List[Variable] = field(default_factory=list)
forward_decls: typing.List[ForwardDecl] = field(default_factory=list)
using: typing.List[UsingDecl] = field(default_factory=list)
using_ns: typing.List[UsingDecl] = field(default_factory=list)
using_alias: typing.List[UsingAlias] = field(default_factory=list)
#: Child namespaces
namespaces: typing.Dict[str, "NamespaceScope"] = field(default_factory=dict)
Block = typing.Union[ClassScope, NamespaceScope]
@dataclass
class Define:
content: str
@dataclass
class Pragma:
content: str
@dataclass
class Include:
#: The filename includes the surrounding ``<>`` or ``"``
filename: str
@dataclass
class UsingNamespace:
ns: str
@dataclass
class ParsedData:
namespace: NamespaceScope = field(default_factory=lambda: NamespaceScope())
defines: typing.List[Define] = field(default_factory=list)
pragmas: typing.List[Pragma] = field(default_factory=list)
includes: typing.List[Include] = field(default_factory=list)
#
# Visitor implementation
#
class SimpleCxxVisitor:
"""
A simple visitor that stores all of the C++ elements passed to it
in an "easy" to use data structure
.. warning:: Names are not resolved, so items are stored in the scope that
they are found. For example:
.. code-block:: c++
namespace N {
class C;
}
class N::C {
void fn();
};
The 'C' class would be a forward declaration in the 'N' namespace,
but the ClassDecl for 'C' would be stored in the global
namespace instead of the 'N' namespace.
"""
data: ParsedData
namespace: NamespaceScope
block: Block
def __init__(self):
self.namespace = NamespaceScope("")
self.block = self.namespace
self.ns_stack = typing.Deque[NamespaceScope]()
self.block_stack = typing.Deque[Block]()
self.data = ParsedData(self.namespace)
def on_define(self, state: State, content: str) -> None:
self.data.defines.append(Define(content))
def on_pragma(self, state: State, content: str) -> None:
self.data.pragmas.append(Pragma(content))
def on_include(self, state: State, filename: str) -> None:
self.data.includes.append(Include(filename))
def on_empty_block_start(self, state: EmptyBlockState) -> None:
# this matters for some scope/resolving purposes, but you're
# probably going to want to use clang if you care about that
# level of detail
pass
def on_empty_block_end(self, state: EmptyBlockState) -> None:
pass
def on_extern_block_start(self, state: ExternBlockState) -> None:
pass # TODO
def on_extern_block_end(self, state: ExternBlockState) -> None:
pass
def on_namespace_start(self, state: NamespaceBlockState) -> None:
parent_ns = self.namespace
self.block_stack.append(parent_ns)
self.ns_stack.append(parent_ns)
ns = None
names = state.namespace.names
if not names:
# all anonymous namespaces in a translation unit are the same
names = [""]
for name in names:
ns = parent_ns.namespaces.get(name)
if ns is None:
ns = NamespaceScope(name)
parent_ns.namespaces[name] = ns
parent_ns = ns
self.block = ns
self.namespace = ns
def on_namespace_end(self, state: NamespaceBlockState) -> None:
self.block = self.block_stack.pop()
self.namespace = self.ns_stack.pop()
def on_forward_decl(self, state: State, fdecl: ForwardDecl) -> None:
self.block.forward_decls.append(fdecl)
def on_variable(self, state: State, v: Variable) -> None:
self.block.variables.append(v)
def on_function(self, state: State, fn: Function) -> None:
self.block.functions.append(fn)
def on_typedef(self, state: State, typedef: Typedef) -> None:
self.block.typedefs.append(typedef)
def on_using_namespace(self, state: State, namespace: typing.List[str]) -> None:
ns = UsingNamespace("::".join(namespace))
self.block.using_ns.append(ns)
def on_using_alias(self, state: State, using: UsingAlias):
self.block.using_alias.append(using)
def on_using_declaration(self, state: State, using: UsingDecl) -> None:
self.block.using.append(using)
#
# Enums
#
def on_enum(self, state: State, enum: EnumDecl) -> None:
self.block.enums.append(enum)
#
# Class/union/struct
#
def on_class_start(self, state: ClassBlockState) -> None:
block = ClassScope(state.class_decl)
self.block.classes.append(block)
self.block_stack.append(self.block)
self.block = block
def on_class_field(self, state: State, f: Field) -> None:
self.block.fields.append(f)
def on_class_method(self, state: ClassBlockState, method: Method) -> None:
self.block.methods.append(method)
def on_class_friend(self, state: ClassBlockState, friend: FriendDecl):
self.block.friends.append(friend)
def on_class_end(self, state: ClassBlockState) -> None:
self.block = self.block_stack.pop()
def parse_string(
content: str,
*,
filename="<str>",
options: typing.Optional[ParserOptions] = None,
cleandoc: bool = False,
) -> ParsedData:
"""
Simple function to parse a header and return a data structure
"""
if cleandoc:
content = inspect.cleandoc(content)
visitor = SimpleCxxVisitor()
parser = CxxParser(filename, content, visitor, options)
parser.parse()
return visitor.data
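# A quick usage sketch of the simple API (the snippet parsed here is arbitrary):
#
#   from cxxheaderparser.simple import parse_string
#   data = parse_string("int x = 0;")
#   # data.namespace is the global NamespaceScope; ``x`` ends up in
#   # data.namespace.variables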
def parse_file(
filename: str,
encoding: typing.Optional[str] = None,
*,
options: typing.Optional[ParserOptions] = None,
) -> ParsedData:
"""
Simple function to parse a header from a file and return a data structure
"""
with open(filename, encoding=encoding) as fp:
content = fp.read()
return parse_string(content, filename=filename, options=options)

cxxheaderparser/tokfmt.py (Normal file, 74 lines)
@@ -0,0 +1,74 @@
import typing
from .lexer import Lexer
from .types import Token
# key: token type, value: (left spacing, right spacing)
_want_spacing = {
"NUMBER": (2, 2),
"FLOAT_NUMBER": (2, 2),
"NAME": (2, 2),
"CHAR_LITERAL": (2, 2),
"STRING_LITERAL": (2, 2),
"ELLIPSIS": (2, 2),
">": (0, 2),
")": (0, 1),
"(": (1, 0),
",": (0, 3),
"*": (1, 2),
"&": (0, 2),
}
_want_spacing.update(dict.fromkeys(Lexer.keywords, (2, 2)))
def tokfmt(toks: typing.List[Token]) -> str:
"""
Helper function that takes a list of tokens and converts them to a string
"""
last = 0
vals = []
default = (0, 0)
ws = _want_spacing
for tok in toks:
value = tok.value
# special case
if value == "operator":
l, r = 2, 0
else:
l, r = ws.get(tok.type, default)
if l + last >= 3:
vals.append(" ")
last = r
vals.append(value)
return "".join(vals)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("header")
args = parser.parse_args()
lexer = Lexer(args.header)
with open(lexer.filename) as fp:
lexer.input(fp.read())
toks = []
while True:
tok = lexer.token_eof_ok()
if not tok:
break
if tok.type == ";":
print(toks)
print(tokfmt(toks))
toks = []
else:
toks.append(tok)
print(toks)
print(tokfmt(toks))

cxxheaderparser/types.py (Normal file, 650 lines)
@@ -0,0 +1,650 @@
import typing
from dataclasses import dataclass, field
@dataclass
class Token:
"""
In an ideal world, this Token class would not be exposed via the user
visible API. Unfortunately, getting to that point would take a significant
amount of effort.
It is not expected that these will change, but they might.
At the moment, the only supported use of Token objects is in conjunction
with the ``tokfmt`` function. As this library matures, we'll try to clarify
the expectations around these. File an issue on github if you have ideas!
"""
#: Raw value of the token
value: str
#: Lex type of the token
type: str = field(repr=False, compare=False, default="")
@dataclass
class Value:
"""
An unparsed list of tokens
.. code-block:: c++
int x = 0x1337;
~~~~~~
"""
#: Tokens corresponding to the value
tokens: typing.List[Token]
@dataclass
class NamespaceDecl:
"""
Namespace declarations
.. code-block:: c++
namespace foo::bar {}
~~~~~~~~
"""
#: These are the names (split by ::) for this namespace declaration,
#: but do not include any parent namespace names
#:
#: An anonymous namespace is an empty list
names: typing.List[str]
inline: bool = False
@dataclass
class DecltypeSpecifier:
"""
Contents of a decltype (inside the parentheses)
.. code-block:: c++
decltype(Foo::Bar)
~~~~~~~~
"""
#: Unparsed tokens within the decltype
tokens: typing.List[Token]
@dataclass
class FundamentalSpecifier:
"""
A specifier that only contains fundamental types
"""
name: str
@dataclass
class NameSpecifier:
"""
An individual segment of a type name
.. code-block:: c++
Foo::Bar
~~~
"""
name: str
specialization: typing.Optional[typing.List["TemplateSpecialization"]] = None
@dataclass
class AutoSpecifier:
"""
Used for an auto return type
"""
name: str = "auto"
@dataclass
class AnonymousName:
"""
A name for an anonymous class, such as in a typedef. There is no string
associated with this name, only an integer id. Things that share the same
anonymous name have anonymous name instances with the same id
"""
#: Unique id associated with this name (only unique per parser instance!)
id: int
PQNameSegment = typing.Union[
AnonymousName, FundamentalSpecifier, NameSpecifier, DecltypeSpecifier, AutoSpecifier
]
@dataclass
class PQName:
"""
Possibly qualified name of a C++ type.
"""
#: All of the segments of the name. This is always guaranteed to have at
#: least one element in it. Name is segmented by '::'
#:
#: If a name refers to the global namespace, the first segment will be an
#: empty NameSpecifier
segments: typing.List[PQNameSegment]
#: Set if the name starts with class/enum/struct
classkey: typing.Optional[str] = None
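# A rough sketch of how a qualified name decomposes (hand-written, not actual
# parser output):
#
#   Foo::Bar  ->  PQName(segments=[NameSpecifier(name="Foo"), NameSpecifier(name="Bar")])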
@dataclass
class Enumerator:
"""
An individual value of an enumeration
"""
#: The enumerator key name
name: str
#: None if not explicitly specified
value: typing.Optional[Value] = None
#: Documentation if present
doxygen: typing.Optional[str] = None
@dataclass
class EnumDecl:
"""
An enumeration type
"""
typename: PQName
values: typing.List[Enumerator]
base: typing.Optional[PQName] = None
#: Documentation if present
doxygen: typing.Optional[str] = None
#: If within a class, the access level for this decl
access: typing.Optional[str] = None
@dataclass
class TemplateArgument:
"""
A single argument for a template specialization
.. code-block:: c++
Foo<int, Bar...>
~~~
"""
#: This contains unparsed arbitrary expressions, including additional
#: specializations or decltypes or whatever
tokens: typing.List[Token]
@dataclass
class TemplateSpecialization:
"""
Contains the arguments of a template specialization
.. code-block:: c++
Foo<int, Bar...>
~~~~~~~~~~~
"""
args: typing.List[TemplateArgument]
#: If True, indicates a parameter pack (...) on the last parameter
param_pack: bool = False
@dataclass
class FunctionType:
"""
A function type, currently only used in a function pointer
.. note:: There can only be one of FunctionType or Type in a DecoratedType
chain
"""
return_type: "DecoratedType"
parameters: typing.List["Parameter"]
#: If a member function pointer
# TODO classname: typing.Optional[PQName]
#: Set to True if ends with ``...``
vararg: bool = False
@dataclass
class Type:
""""""
typename: PQName
const: bool = False
volatile: bool = False
def get_type(self) -> "Type":
return self
@dataclass
class Array:
"""
Information about an array. Multidimensional arrays are represented as
an array of arrays.
"""
#: The type that this is an array of
array_of: typing.Union["Array", "Pointer", Type]
#: Size of the array
#:
#: .. code-block:: c++
#:
#: int x[10];
#: ~~
size: typing.Optional[Value]
def get_type(self) -> Type:
return self.array_of.get_type()
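# A rough sketch for the docstring example above (hand-written, not actual
# parser output; the int specifier is assumed to be a FundamentalSpecifier):
#
#   int x[10]  ->  Array(
#       array_of=Type(typename=PQName(segments=[FundamentalSpecifier("int")])),
#       size=Value(tokens=[Token("10")]),
#   )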
@dataclass
class Pointer:
"""
A pointer
"""
#: Thing that this points to
ptr_to: typing.Union[Array, FunctionType, "Pointer", Type]
const: bool = False
volatile: bool = False
def get_type(self) -> Type:
return self.ptr_to.get_type()
@dataclass
class Reference:
"""
An lvalue (``&``) reference
"""
ref_to: typing.Union[Array, Pointer, Type]
def get_type(self) -> Type:
return self.ref_to.get_type()
@dataclass
class MoveReference:
"""
An rvalue (``&&``) reference
"""
moveref_to: typing.Union[Array, Pointer, Type]
def get_type(self) -> Type:
return self.moveref_to.get_type()
#: A type or function type that is decorated with various things
#:
#: .. note:: There can only be one of FunctionType or Type in a DecoratedType
#: chain
DecoratedType = typing.Union[Array, Pointer, MoveReference, Reference, Type]
@dataclass
class TemplateNonTypeParam:
"""
.. code-block:: c++
template <int T>
~~~~~
template <class T, typename T::type* U>
~~~~~~~~~~~~~~~~~~~
template <auto T>
~~~~~~
"""
type: DecoratedType
name: typing.Optional[str] = None
default: typing.Optional[Value] = None
#: Contains a ``...``
param_pack: bool = False
@dataclass
class TemplateTypeParam:
"""
.. code-block:: c++
template <typename T>
~~~~~~~~~~
"""
#: 'typename' or 'class'
typekey: str
name: typing.Optional[str] = None
param_pack: bool = False
default: typing.Optional[Value] = None
#: A template-template param
template: typing.Optional["TemplateDecl"] = None
#: A parameter for a template declaration
#:
#: .. code-block:: c++
#:
#: template <typename T>
#: ~~~~~~~~~~
TemplateParam = typing.Union[TemplateNonTypeParam, TemplateTypeParam]
@dataclass
class TemplateDecl:
"""
Template declaration for a function or class
.. code-block:: c++
template <typename T>
class Foo {};
template <typename T>
T fn();
"""
params: typing.List[TemplateParam] = field(default_factory=list)
@dataclass
class ForwardDecl:
"""
Represents a forward declaration of a user defined type
"""
typename: PQName
template: typing.Optional[TemplateDecl] = None
doxygen: typing.Optional[str] = None
#: Set if this is a forward declaration of an enum and it has a base
enum_base: typing.Optional[PQName] = None
#: If within a class, the access level for this decl
access: typing.Optional[str] = None
@dataclass
class BaseClass:
"""
Base class declarations for a class
"""
#: access specifier for this base
access: str
#: possibly qualified type name for the base
typename: PQName
#: Virtual inheritance
virtual: bool = False
#: Contains a ``...``
param_pack: bool = False
@dataclass
class ClassDecl:
"""
A class is a user defined type (class/struct/union)
"""
typename: PQName
bases: typing.List[BaseClass] = field(default_factory=list)
template: typing.Optional[TemplateDecl] = None
explicit: bool = False
final: bool = False
doxygen: typing.Optional[str] = None
#: If within a class, the access level for this decl
access: typing.Optional[str] = None
@property
def classkey(self) -> str:
return self.typename.classkey
@dataclass
class Parameter:
"""
A parameter of a function/method
"""
type: DecoratedType
name: typing.Optional[str] = None
default: typing.Optional[Value] = None
param_pack: bool = False
@dataclass
class Function:
"""
A function declaration, potentially with the function body
"""
return_type: DecoratedType
name: PQName
parameters: typing.List[Parameter]
#: Set to True if ends with ``...``
vararg: bool = False
doxygen: typing.Optional[str] = None
constexpr: bool = False
extern: typing.Union[bool, str] = False
static: bool = False
inline: bool = False
#: If true, the body of the function is present
has_body: bool = False
template: typing.Optional[TemplateDecl] = None
throw: typing.Optional[Value] = None
noexcept: typing.Optional[Value] = None
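# Illustrative sketch (not part of this commit): the declaration
# ``int add(int x, int y);`` expressed with Function/Parameter. The parser
# would produce separate Type instances; one shared instance is used here
# only for brevity. PQName/NameSpecifier field names are assumed.
_int_t = Type(typename=PQName(segments=[NameSpecifier("int")]))
_example_function = Function(
    return_type=_int_t,
    name=PQName(segments=[NameSpecifier("add")]),
    parameters=[
        Parameter(type=_int_t, name="x"),
        Parameter(type=_int_t, name="y"),
    ],
)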
@dataclass
class Method(Function):
"""
A method declaration, potentially with the method body
"""
#: constructors and destructors don't have a return type
return_type: typing.Optional[DecoratedType]
access: str = ""
const: bool = False
volatile: bool = False
#: ref-qualifier for this method, either lvalue (&) or rvalue (&&)
#:
#: .. code-block:: c++
#:
#: void foo() &&;
#: ~~
#:
ref_qualifier: typing.Optional[str] = None
constructor: bool = False
explicit: bool = False
default: bool = False
deleted: bool = False
destructor: bool = False
pure_virtual: bool = False
virtual: bool = False
final: bool = False
override: bool = False
@dataclass
class Operator(Method):
operator: str = ""
@dataclass
class FriendDecl:
"""
Represents a friend declaration -- friends can only be classes or functions
"""
cls: typing.Optional[ForwardDecl] = None
fn: typing.Optional[Function] = None
@dataclass
class Typedef:
"""
A typedef specifier. A unique typedef specifier is created for each alias
created by the typedef.
.. code-block:: c++
typedef type name, *pname;
"""
#: The aliased type
#:
#: .. code-block:: c++
#:
#: typedef type *pname;
#: ~~~~~~
type: DecoratedType
#: The alias introduced for the specified type
#:
#: .. code-block:: c++
#:
#: typedef type *pname;
#: ~~~~~
name: str
#: If within a class, the access level for this decl
access: typing.Optional[str] = None
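# Illustrative sketch (not part of this commit): ``typedef int T, *PT;``
# produces one Typedef per alias, the second wrapping the type in a Pointer.
# The shared Type instance is only for brevity; PQName/NameSpecifier field
# names are assumed from earlier in this file.
_typedef_base = Type(typename=PQName(segments=[NameSpecifier("int")]))
_example_typedefs = [
    Typedef(type=_typedef_base, name="T"),
    Typedef(type=Pointer(ptr_to=_typedef_base), name="PT"),
]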
@dataclass
class Variable:
"""
A variable declaration
"""
name: PQName
type: DecoratedType
value: typing.Optional[Value] = None
constexpr: bool = False
extern: typing.Union[bool, str] = False
static: bool = False
inline: bool = False
    #: Can occur for a static variable of a templated class
template: typing.Optional[TemplateDecl] = None
doxygen: typing.Optional[str] = None
@dataclass
class Field:
"""
A field of a class
"""
#: public/private/protected
access: str
type: DecoratedType
name: typing.Optional[str] = None
value: typing.Optional[Value] = None
bits: typing.Optional[int] = None
constexpr: bool = False
mutable: bool = False
static: bool = False
doxygen: typing.Optional[str] = None
@dataclass
class UsingDecl:
"""
.. code-block:: c++
using NS::ClassName;
"""
typename: PQName
#: If within a class, the access level for this decl
access: typing.Optional[str] = None
@dataclass
class UsingAlias:
"""
.. code-block:: c++
using foo = int;
template <typename T>
using VectorT = std::vector<T>;
"""
alias: str
type: DecoratedType
template: typing.Optional[TemplateDecl] = None
#: If within a class, the access level for this decl
access: typing.Optional[str] = None

197
cxxheaderparser/visitor.py Normal file
View File

@@ -0,0 +1,197 @@
import sys
import typing
if sys.version_info >= (3, 8):
Protocol = typing.Protocol
else:
Protocol = object
from .types import (
EnumDecl,
Field,
ForwardDecl,
FriendDecl,
Function,
Method,
Typedef,
UsingAlias,
UsingDecl,
Variable,
)
from .parserstate import (
State,
EmptyBlockState,
ClassBlockState,
ExternBlockState,
NamespaceBlockState,
)
class CxxVisitor(Protocol):
"""
Defines the interface used by the parser to emit events
"""
def on_define(self, state: State, content: str) -> None:
"""
.. warning:: cxxheaderparser intentionally does not have a C preprocessor
implementation. If you are parsing code with macros in it,
use a conforming preprocessor like ``pcpp``
"""
def on_pragma(self, state: State, content: str) -> None:
"""
Called once for each ``#pragma`` directive encountered
"""
def on_include(self, state: State, filename: str) -> None:
"""
Called once for each ``#include`` directive encountered
"""
def on_empty_block_start(self, state: EmptyBlockState) -> None:
"""
Called when a ``{`` is encountered that isn't associated with or
consumed by other declarations.
.. code-block:: c++
{
// stuff
}
"""
def on_empty_block_end(self, state: EmptyBlockState) -> None:
...
def on_extern_block_start(self, state: ExternBlockState) -> None:
"""
.. code-block:: c++
extern "C" {
}
"""
def on_extern_block_end(self, state: ExternBlockState) -> None:
...
def on_namespace_start(self, state: NamespaceBlockState) -> None:
"""
Called when a ``namespace`` directive is encountered
"""
def on_namespace_end(self, state: NamespaceBlockState) -> None:
"""
Called at the end of a ``namespace`` block
"""
def on_forward_decl(self, state: State, fdecl: ForwardDecl) -> None:
"""
Called when a forward declaration is encountered
"""
def on_variable(self, state: State, v: Variable) -> None:
...
def on_function(self, state: State, fn: Function) -> None:
...
def on_typedef(self, state: State, typedef: Typedef) -> None:
"""
Called for each typedef instance encountered. For example:
.. code-block:: c++
typedef int T, *PT;
Will result in ``on_typedef`` being called twice, once for ``T`` and
once for ``*PT``
"""
def on_using_namespace(self, state: State, namespace: typing.List[str]) -> None:
"""
.. code-block:: c++
using namespace std;
"""
    def on_using_alias(self, state: State, using: UsingAlias) -> None:
        """
        .. code-block:: c++

            using foo = int;

            template <typename T>
            using VectorT = std::vector<T>;
        """
def on_using_declaration(self, state: State, using: UsingDecl) -> None:
"""
.. code-block:: c++
using NS::ClassName;
"""
#
# Enums
#
def on_enum(self, state: State, enum: EnumDecl) -> None:
"""
Called after an enum is encountered
"""
#
# Class/union/struct
#
def on_class_start(self, state: ClassBlockState) -> None:
"""
Called when a class/struct/union is encountered
When part of a typedef:
.. code-block:: c++
typedef struct { } X;
This is called first, followed by on_typedef for each typedef instance
encountered. The compound type object is passed as the type to the
typedef.
"""
def on_class_field(self, state: ClassBlockState, f: Field) -> None:
"""
Called when a field of a class is encountered
"""
    def on_class_friend(self, state: ClassBlockState, friend: FriendDecl) -> None:
"""
Called when a friend declaration is encountered
"""
def on_class_method(self, state: ClassBlockState, method: Method) -> None:
"""
Called when a method of a class is encountered
"""
def on_class_end(self, state: ClassBlockState) -> None:
"""
Called when the end of a class/struct/union is encountered.
When a variable like this is declared:
.. code-block:: c++
struct X {
} x;
Then ``on_class_start``, .. ``on_class_end`` are emitted, along with
``on_variable`` for each instance declared.
"""