PYTHON   50

tokenize.py

Guest on 2nd August 2021 03:32:36 PM

  1. #!/usr/bin/env python
  2. #
  3. # Copyright  Neal Norwitz
  4. # Portions Copyright  Google Inc.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. #      http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17.  
  18. """Tokenize C++ source code."""
  19.  
  20. __author__ = 'nnorwitz@google.com (Neal Norwitz)'
  21.  
  22.  
  23. try:
  24.     # Python 3.x
  25.     import builtins
  26. except ImportError:
  27.     # Python 2.x
  28.     import __builtin__ as builtins
  29.  
  30.  
  31. import sys
  32.  
  33. from cpp import utils
  34.  
  35.  
  36. if not hasattr(builtins, 'set'):
  37.     # Nominal support for Python 2.3.
  38.     from sets import Set as set
  39.  
  40.  
  41. # Add $ as a valid identifier char since so much code uses it.
  42. _letters = 'abcdefghijklmnopqrstuvwxyz'
  43. VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
  44. HEX_DIGITS = set('0123456789abcdefABCDEF')
  45. INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
  46.  
  47.  
  48. # C++0x string preffixes.
  49. _STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
  50.  
  51.  
  52. # Token types.
  53. UNKNOWN = 'UNKNOWN'
  54. SYNTAX = 'SYNTAX'
  55. CONSTANT = 'CONSTANT'
  56. NAME = 'NAME'
  57. PREPROCESSOR = 'PREPROCESSOR'
  58.  
  59. # Where the token originated from.  This can be used for backtracking.
  60. # It is always set to WHENCE_STREAM in this code.
  61. WHENCE_STREAM, WHENCE_QUEUE = range(2)
  62.  
  63.  
  64. class Token(object):
  65.     """Data container to represent a C++ token.
  66.  
  67.    Tokens can be identifiers, syntax char(s), constants, or
  68.    pre-processor directives.
  69.  
  70.    start contains the index of the first char of the token in the source
  71.    end contains the index of the last char of the token in the source
  72.    """
  73.  
  74.     def __init__(self, token_type, name, start, end):
  75.         self.token_type = token_type
  76.         self.name = name
  77.         self.start = start
  78.         self.end = end
  79.         self.whence = WHENCE_STREAM
  80.  
  81.     def __str__(self):
  82.         if not utils.DEBUG:
  83.             return 'Token(%r)' % self.name
  84.         return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
  85.  
  86.     __repr__ = __str__
  87.  
  88.  
  89. def _GetString(source, start, i):
  90.     i = source.find('"', i+1)
  91.     while source[i-1] == '\\':
  92.         # Count the trailing backslashes.
  93.         backslash_count = 1
  94.         j = i - 2
  95.         while source[j] == '\\':
  96.             backslash_count += 1
  97.             j -= 1
  98.         # When trailing backslashes are even, they escape each other.
  99.         if (backslash_count % 2) == 0:
  100.             break
  101.         i = source.find('"', i+1)
  102.     return i + 1
  103.  
  104.  
  105. def _GetChar(source, start, i):
  106.     # NOTE(nnorwitz): may not be quite correct, should be good enough.
  107.     i = source.find("'", i+1)
  108.     while source[i-1] == '\\':
  109.         # Need to special case '\\'.
  110.         if (i - 2) > start and source[i-2] == '\\':
  111.             break
  112.         i = source.find("'", i+1)
  113.     # Try to handle unterminated single quotes (in a #if 0 block).
  114.     if i < 0:
  115.         i = start
  116.     return i + 1
  117.  
  118.  
  119. def GetTokens(source):
  120.     """Returns a sequence of Tokens.
  121.  
  122.    Args:
  123.      source: string of C++ source code.
  124.  
  125.    Yields:
  126.      Token that represents the next token in the source.
  127.    """
  128.     # Cache various valid character sets for speed.
  129.     valid_identifier_chars = VALID_IDENTIFIER_CHARS
  130.     hex_digits = HEX_DIGITS
  131.     int_or_float_digits = INT_OR_FLOAT_DIGITS
  132.     int_or_float_digits2 = int_or_float_digits | set('.')
  133.  
  134.     # Only ignore errors while in a #if 0 block.
  135.     ignore_errors = False
  136.     count_ifs = 0
  137.  
  138.     i = 0
  139.     end = len(source)
  140.     while i < end:
  141.         # Skip whitespace.
  142.         while i < end and source[i].isspace():
  143.             i += 1
  144.         if i >= end:
  145.             return
  146.  
  147.         token_type = UNKNOWN
  148.         start = i
  149.         c = source[i]
  150.         if c.isalpha() or c == '_':              # Find a string token.
  151.             token_type = NAME
  152.             while source[i] in valid_identifier_chars:
  153.                 i += 1
  154.             # String and character constants can look like a name if
  155.             # they are something like L"".
  156.             if (source[i] == "'" and (i - start) == 1 and
  157.                 source[start:i] in 'uUL'):
  158.                 # u, U, and L are valid C++0x character preffixes.
  159.                 token_type = CONSTANT
  160.                 i = _GetChar(source, start, i)
  161.             elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
  162.                 token_type = CONSTANT
  163.                 i = _GetString(source, start, i)
  164.         elif c == '/' and source[i+1] == '/':    # Find // comments.
  165.             i = source.find('\n', i)
  166.             if i == -1:  # Handle EOF.
  167.                 i = end
  168.             continue
  169.         elif c == '/' and source[i+1] == '*':    # Find /* comments. */
  170.             i = source.find('*/', i) + 2
  171.             continue
  172.         elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
  173.             token_type = SYNTAX
  174.             i += 1
  175.             new_ch = source[i]
  176.             if new_ch == c and c != '>':         # Treat ">>" as two tokens.
  177.                 i += 1
  178.             elif c == '-' and new_ch == '>':
  179.                 i += 1
  180.             elif new_ch == '=':
  181.                 i += 1
  182.         elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
  183.             token_type = SYNTAX
  184.             i += 1
  185.             if c == '.' and source[i].isdigit():
  186.                 token_type = CONSTANT
  187.                 i += 1
  188.                 while source[i] in int_or_float_digits:
  189.                     i += 1
  190.                 # Handle float suffixes.
  191.                 for suffix in ('l', 'f'):
  192.                     if suffix == source[i:i+1].lower():
  193.                         i += 1
  194.                         break
  195.         elif c.isdigit():                        # Find integer.
  196.             token_type = CONSTANT
  197.             if c == '0' and source[i+1] in 'xX':
  198.                 # Handle hex digits.
  199.                 i += 2
  200.                 while source[i] in hex_digits:
  201.                     i += 1
  202.             else:
  203.                 while source[i] in int_or_float_digits2:
  204.                     i += 1
  205.             # Handle integer (and float) suffixes.
  206.             for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
  207.                 size = len(suffix)
  208.                 if suffix == source[i:i+size].lower():
  209.                     i += size
  210.                     break
  211.         elif c == '"':                           # Find string.
  212.             token_type = CONSTANT
  213.             i = _GetString(source, start, i)
  214.         elif c == "'":                           # Find char.
  215.             token_type = CONSTANT
  216.             i = _GetChar(source, start, i)
  217.         elif c == '#':                           # Find pre-processor command.
  218.             token_type = PREPROCESSOR
  219.             got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
  220.             if got_if:
  221.                 count_ifs += 1
  222.             elif source[i:i+6] == '#endif':
  223.                 count_ifs -= 1
  224.                 if count_ifs == 0:
  225.                     ignore_errors = False
  226.  
  227.             # TODO(nnorwitz): handle preprocessor statements (\ continuations).
  228.             while 1:
  229.                 i1 = source.find('\n', i)
  230.                 i2 = source.find('//', i)
  231.                 i3 = source.find('/*', i)
  232.                 i4 = source.find('"', i)
  233.                 # NOTE(nnorwitz): doesn't handle comments in #define macros.
  234.                 # Get the first important symbol (newline, comment, EOF/end).
  235.                 i = min([x for x in (i1, i2, i3, i4, end) if x != -1])
  236.  
  237.                 # Handle #include "dir//foo.h" properly.
  238.                 if source[i] == '"':
  239.                     i = source.find('"', i+1) + 1
  240.                     assert i > 0
  241.                     continue
  242.                 # Keep going if end of the line and the line ends with \.
  243.                 if not (i == i1 and source[i-1] == '\\'):
  244.                     if got_if:
  245.                         condition = source[start+4:i].lstrip()
  246.                         if (condition.startswith('0') or
  247.                             condition.startswith('(0)')):
  248.                             ignore_errors = True
  249.                     break
  250.                 i += 1
  251.         elif c == '\\':                          # Handle \ in code.
  252.             # This is different from the pre-processor \ handling.
  253.             i += 1
  254.             continue
  255.         elif ignore_errors:
  256.             # The tokenizer seems to be in pretty good shape.  This
  257.             # raise is conditionally disabled so that bogus code
  258.             # in an #if 0 block can be handled.  Since we will ignore
  259.             # it anyways, this is probably fine.  So disable the
  260.             # exception and  return the bogus char.
  261.             i += 1
  262.         else:
  263.             sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
  264.                              ('?', i, c, source[i-10:i+10]))
  265.             raise RuntimeError('unexpected token')
  266.  
  267.         if i <= 0:
  268.             print('Invalid index, exiting now.')
  269.             return
  270.         yield Token(token_type, source[start:i], start, i)
  271.  
  272.  
  273. if __name__ == '__main__':
  274.     def main(argv):
  275.         """Driver mostly for testing purposes."""
  276.         for filename in argv[1:]:
  277.             source = utils.ReadFile(filename)
  278.             if source is None:
  279.                 continue
  280.  
  281.             for token in GetTokens(source):
  282.                 print('%-12s: %s' % (token.token_type, token.name))
  283.                 # print('\r%6.2f%%' % (100.0 * index / token.end),)
  284.             sys.stdout.write('\n')
  285.  
  286.  
  287.     main(sys.argv)

Raw Paste


Login or Register to edit or fork this paste. It's free.