forked from chromium/chromium
-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotation_tokenizer.py
123 lines (97 loc) · 3.76 KB
/
annotation_tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
A tokenizer for traffic annotation definitions.
"""
from collections import namedtuple
import re
# Regexen that match a token inside the annotation definition arguments. Stored
# as a list instead of a dict, to preserve order.
#
# Order matters because otherwise, 'symbol' could be parsed before
# 'string_literal' (i.e., R"(...)" would be misinterpreted as the symbol 'R',
# followed by a string with parentheses in it).
TOKEN_REGEXEN = [
    # Comma for separating args.
    ('comma', re.compile(r'(,)')),
    # String literal. "string" or R"(string)".
    #
    # In the plain-string alternative, the escape branch (\\.) must be tried
    # before the catch-all ([^"\\]); otherwise a bare backslash would be
    # consumed on its own and the escaped quote in "a\"b" would terminate the
    # literal early.
    ('string_literal',
     re.compile(r'"((?:\\.|[^"\\])*?)"|R"\((.*?)\)"', re.DOTALL)),
    # C++ identifier.
    ('symbol', re.compile(r'([a-zA-Z_][a-zA-Z_0-9]*)')),
    # Left parenthesis.
    ('left_paren', re.compile(r'(\()')),
    # Right parenthesis.
    ('right_paren', re.compile(r'(\))')),
]

# Number of characters to include in the context (for error reporting).
CONTEXT_LENGTH = 20

# A single lexed token: |type| is one of the TOKEN_REGEXEN names, |value| is
# the captured text, and |pos| is the offset just past the token's end.
Token = namedtuple('Token', ['type', 'value', 'pos'])
class CppParsingError(Exception):
  """An error during C++ parsing/tokenizing."""

  def __init__(self, expected_type, body, pos, file_path, line_number):
    # Quote a short snippet of the unparsed text so the error is actionable.
    snippet = body[pos:pos + CONTEXT_LENGTH]
    message = "Expected {} in annotation definition at {}:{}.\nnear '{}'".format(
        expected_type, file_path, line_number, snippet)
    super(CppParsingError, self).__init__(message)
class Tokenizer:
  """Simple tokenizer with basic error reporting.

  Use advance() or maybe_advance() to take tokens from the string, one at a
  time.
  """

  # Compiled once at class-definition time, so _skip_whitespace() does not
  # rebuild (or re-look-up) the pattern on every call.
  _WHITESPACE_RE = re.compile(r'\s*')

  def __init__(self, body, file_path, line_number):
    """Args:
      body: the text to tokenize (the annotation definition's arguments).
      file_path: path of the file |body| came from; used in error messages.
      line_number: line where |body| starts; used in error messages.
    """
    self.body = body
    self.pos = 0
    self.file_path = file_path
    self.line_number = line_number

  def _assert_token_type(self, token, expected_type):
    """Like assert(), but reports errors in a _somewhat_ useful way.

    Raises:
      CppParsingError: if |token| is None or its type is not |expected_type|.
    """
    if token and token.type == expected_type:
      return
    # Skip whitespace to make the error message more useful.
    pos = self._skip_whitespace()
    raise CppParsingError(expected_type, self.body, pos, self.file_path,
                          self.line_number)

  def _skip_whitespace(self):
    """Return the position of the first non-whitespace character from here."""
    return self._WHITESPACE_RE.match(self.body, self.pos).end()

  def _get_token(self):
    """Return the token at the current position, or None on failure.

    Does not consume the token: self.pos is left unchanged. The returned
    Token's |pos| field is the offset just past the token, so callers commit
    to it by assigning it to self.pos.
    """
    # Skip initial whitespace.
    pos = self._skip_whitespace()
    # Try each token pattern in order; the first that matches here wins.
    for token_type, regex in TOKEN_REGEXEN:
      re_match = regex.match(self.body, pos)
      if re_match:
        # Exactly one capture group participates in the match; take it.
        token_content = next(g for g in re_match.groups() if g is not None)
        return Token(token_type, token_content, re_match.end())
    return None

  def maybe_advance(self, expected_type):
    """Advance the tokenizer by one token if it has |expected_type|.

    Args:
      expected_type: expected |type| attribute of the token.

    Returns:
      The |value| attribute of the token if it has the right type, or None if
      it has another type (in which case the tokenizer does not advance).
    """
    token = self._get_token()
    if token and token.type == expected_type:
      self.pos = token.pos
      return token.value
    return None

  def advance(self, expected_type):
    """Advance the tokenizer by one token, asserting its type.

    Throws an error if the token at point has the wrong type.

    Args:
      expected_type: expected |type| attribute of the token.

    Returns:
      The |value| attribute of the token at point.

    Raises:
      CppParsingError: if there is no token here, or it has the wrong type.
    """
    token = self._get_token()
    self._assert_token_type(token, expected_type)
    self.pos = token.pos
    return token.value