Skip to content

Commit

Permalink
Merge pull request arrow-py#382 from ownaginatious/cached_parsing
Browse files Browse the repository at this point in the history
Cached compiled format strings to speed up repetitive parsing. Upgraded chai and python-dateutil. Thanks, @ownaginatious!
  • Loading branch information
andrewelkins authored Dec 2, 2016
2 parents 7cc5872 + 8c0f58e commit 4996bc9
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 14 deletions.
40 changes: 28 additions & 12 deletions arrow/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
from datetime import datetime
from dateutil import tz
import re

try:
from functools import lru_cache
except ImportError: # pragma: no cover
from backports.functools_lru_cache import lru_cache # pragma: no cover

from arrow import locales


Expand Down Expand Up @@ -50,7 +56,7 @@ class DateTimeParser(object):
MARKERS = ['YYYY', 'MM', 'DD']
SEPARATORS = ['-', '/', '.']

def __init__(self, locale='en_us'):
def __init__(self, locale='en_us', cache_size=0):

self.locale = locales.get_locale(locale)
self._input_re_map = self._BASE_INPUT_RE_MAP.copy()
Expand All @@ -62,14 +68,17 @@ def __init__(self, locale='en_us'):
'dddd': self._choice_re(self.locale.day_names[1:], re.IGNORECASE),
'ddd': self._choice_re(self.locale.day_abbreviations[1:],
re.IGNORECASE),
'd' : re.compile("[1-7]"),
'd': re.compile(r"[1-7]"),
'a': self._choice_re(
(self.locale.meridians['am'], self.locale.meridians['pm'])
),
# note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to
# ensure backwards compatibility of this token
'A': self._choice_re(self.locale.meridians.values())
})
if cache_size > 0:
self._generate_pattern_re =\
lru_cache(maxsize=cache_size)(self._generate_pattern_re)

def parse_iso(self, string):

Expand Down Expand Up @@ -98,8 +107,8 @@ def parse_iso(self, string):
# using various separators: -, /, .
l = len(self.MARKERS)
formats = [separator.join(self.MARKERS[:l-i])
for i in range(l)
for separator in self.SEPARATORS]
for i in range(l)
for separator in self.SEPARATORS]

if has_time and has_tz:
formats = [f + 'Z' for f in formats]
Expand All @@ -109,10 +118,7 @@ def parse_iso(self, string):

return self._parse_multiformat(string, formats)

def parse(self, string, fmt):

if isinstance(fmt, list):
return self._parse_multiformat(string, fmt)
def _generate_pattern_re(self, fmt):

# fmt is a string of tokens like 'YYYY-MM-DD'
# we construct a new string by replacing each
Expand All @@ -122,7 +128,7 @@ def parse(self, string, fmt):
offset = 0

# Extract the bracketed expressions to be reinserted later.
escaped_fmt = re.sub(self._ESCAPE_RE, "#" , fmt)
escaped_fmt = re.sub(self._ESCAPE_RE, "#", fmt)
# Any number of S is the same as one.
escaped_fmt = re.sub('S+', 'S', escaped_fmt)
escaped_data = re.findall(self._ESCAPE_RE, fmt)
Expand Down Expand Up @@ -154,11 +160,21 @@ def parse(self, string, fmt):
if i < len(b):
final_fmt_pattern += b[i][1:-1]

match = re.search(final_fmt_pattern, string, flags=re.IGNORECASE)
return tokens, re.compile(final_fmt_pattern, flags=re.IGNORECASE)

def parse(self, string, fmt):

if isinstance(fmt, list):
return self._parse_multiformat(string, fmt)

fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)

match = fmt_pattern_re.search(string)
if match is None:
raise ParserError('Failed to match \'{0}\' when parsing \'{1}\''.format(final_fmt_pattern, string))
raise ParserError('Failed to match \'{0}\' when parsing \'{1}\''
.format(fmt_pattern_re.pattern, string))
parts = {}
for token in tokens:
for token in fmt_tokens:
if token == 'Do':
value = match.group('value')
else:
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
python-dateutil==2.1
python-dateutil==2.6.0
nose==1.3.0
nose-cov==1.6
chai==0.4.8
chai==1.1.1
sphinx==1.3.5
simplejson==3.6.5
backports.functools_lru_cache==1.2.1
38 changes: 38 additions & 0 deletions tests/parser_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,44 @@ def test_parse_token_invalid_meridians(self):
self.parser._parse_token('a', 'p..m', parts)
assertEqual(parts, {})

def test_parser_no_caching(self):
    """Call ``_generate_pattern_re`` repeatedly on a parser built with ``cache_size=0``.

    The expectation is registered on ``parser.DateTimeParser`` before the
    parser instance is created, and asks for as many 'fmt_a' calls as the
    loop below performs.
    """
    repeats = 100

    expect(parser.DateTimeParser, '_generate_pattern_re').args('fmt_a').times(repeats)
    self.parser = parser.DateTimeParser(cache_size=0)

    remaining = repeats
    while remaining:
        self.parser._generate_pattern_re('fmt_a')
        remaining -= 1

def test_parser_1_line_caching(self):
    """Call ``_generate_pattern_re`` on a parser built with ``cache_size=1``.

    Runs three phases, each preceded by a new expectation registered on
    ``parser.DateTimeParser``: 'fmt_a' alone, 'fmt_a'/'fmt_b' alternating,
    then 'fmt_a' alone again.
    """
    target = '_generate_pattern_re'
    rounds = range(100)

    # Phase 1: the same format requested on every round.
    expect(parser.DateTimeParser, target).args('fmt_a').times(1)
    self.parser = parser.DateTimeParser(cache_size=1)
    for _round in rounds:
        self.parser._generate_pattern_re('fmt_a')

    # Phase 2: two formats requested alternately, 'fmt_a' first.
    # NOTE(review): with a single cache slot, alternating formats would
    # presumably displace each other on every call -- confirm that
    # times(1) is really the intended count here.
    expect(parser.DateTimeParser, target).args('fmt_b').times(1)
    for _round in rounds:
        for fmt in ('fmt_a', 'fmt_b'):
            self.parser._generate_pattern_re(fmt)

    # Phase 3: back to the first format only.
    expect(parser.DateTimeParser, target).args('fmt_a').times(1)
    for _round in rounds:
        self.parser._generate_pattern_re('fmt_a')

def test_parser_multiple_line_caching(self):
    """Call ``_generate_pattern_re`` on a parser built with ``cache_size=2``.

    Runs three phases, each preceded by a new expectation registered on
    ``parser.DateTimeParser``: 'fmt_a' alone, 'fmt_a'/'fmt_b' alternating,
    then 'fmt_a' alone again (for which zero further calls are expected).
    """
    target = '_generate_pattern_re'
    rounds = range(100)

    # Phase 1: the same format requested on every round.
    expect(parser.DateTimeParser, target).args('fmt_a').times(1)
    self.parser = parser.DateTimeParser(cache_size=2)
    for _round in rounds:
        self.parser._generate_pattern_re('fmt_a')

    # Phase 2: two formats requested alternately, 'fmt_a' first.
    expect(parser.DateTimeParser, target).args('fmt_b').times(1)
    for _round in rounds:
        for fmt in ('fmt_a', 'fmt_b'):
            self.parser._generate_pattern_re(fmt)

    # Phase 3: 'fmt_a' again; the expectation allows no additional calls.
    expect(parser.DateTimeParser, target).args('fmt_a').times(0)
    for _round in rounds:
        self.parser._generate_pattern_re('fmt_a')


class DateTimeParserParseTests(Chai):
Expand Down

0 comments on commit 4996bc9

Please sign in to comment.