Skip to content

Commit

Permalink
Search date in strings
Browse files Browse the repository at this point in the history
Unknown tokens in the string are ignored.

In the previous implementation, each token was matched individually.
With the format 'YYYY-MM-DD', parsing the string '2000 ... 2015-12-31'
would fail because 'YYYY' taken alone matches the first 4 digits ('2000').

The same happens with the format 'MMMM YYYY' and the string 'June was born
in May 1980': 'MMMM' would match 'June' instead of 'May', even though
'June' is not followed by any digits.

This change builds a complete pattern string by replacing each token with
its pattern. 'YYYY-MM-DD' becomes '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'.
The search is then done with that single regular expression,
which matches '2015-12-31' in the previous example.
  • Loading branch information
beenje committed Jul 26, 2015
1 parent 8869524 commit a33a1ef
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 48 deletions.
76 changes: 28 additions & 48 deletions arrow/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

from datetime import datetime
from dateutil import tz

import calendar
import re

from arrow import locales
Expand Down Expand Up @@ -78,9 +76,9 @@ def parse_iso(self, string):

if has_time:
if space_divider:
date_string, time_string = string.split(' ', 1)
date_string, time_string = string.split(' ', 1)
else:
date_string, time_string = string.split('T', 1)
date_string, time_string = string.split('T', 1)
time_parts = re.split('[+-]', time_string, 1)
has_tz = len(time_parts) > 1
has_seconds = time_parts[0].count(':') > 1
Expand Down Expand Up @@ -114,57 +112,39 @@ def parse(self, string, fmt):
if isinstance(fmt, list):
return self._parse_multiformat(string, fmt)

original_string = string
tokens = self._FORMAT_RE.findall(fmt)
token_values = []
separators = self._parse_separators(fmt, tokens)
parts = {}

for token in tokens:
# fmt is a string of tokens like 'YYYY-MM-DD'
# we construct a new string by replacing each
# token by its pattern:
# 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
fmt_pattern = fmt
tokens = []
offset = 0
for m in self._FORMAT_RE.finditer(fmt):
token = m.group(0)
try:
input_re = self._input_re_map[token]
except KeyError:
raise ParserError('Unrecognized token \'{0}\''.format(token))

match = input_re.search(string)

if match:
token_values.append(match.group(0))
if 'value' in match.groupdict():
self._parse_token(token, match.groupdict()['value'], parts)
else:
self._parse_token(token, match.group(0), parts)
index = match.span(0)[1]
string = string[index:]

input_pattern = '(?P<{0}>{1})'.format(token, input_re.pattern)
tokens.append(token)
# a pattern doesn't have the same length as the token
# it replaces! We keep the difference in the offset variable.
# This works because the string is scanned left-to-right and matches
# are returned in the order found by finditer.
fmt_pattern = fmt_pattern[:m.start() + offset] + input_pattern + fmt_pattern[m.end() + offset:]
offset += len(input_pattern) - (m.end() - m.start())
match = re.search(fmt_pattern, string, flags=re.IGNORECASE)
if match is None:
raise ParserError('Failed to match \'{0}\' when parsing \'{1}\''.format(fmt_pattern, string))
parts = {}
for token in tokens:
if token == 'Do':
value = match.group('value')
else:
raise ParserError('Failed to match token \'{0}\' when parsing \'{1}\''.format(token, original_string))

parsed = ''.join(self._interleave_lists(token_values, separators))
if parsed not in original_string:
raise ParserError('Failed to match format \'{0}\' when parsing \'{1}\''.format(fmt, original_string))

value = match.group(token)
self._parse_token(token, value, parts)
return self._build_datetime(parts)

def _interleave_lists(self, tokens, separators):
    """Return a new list that alternates ``tokens`` and ``separators``.

    The result starts with the first token: ``[t0, s0, t1, s1, ...]``.
    ``separators`` must hold either as many items as ``tokens`` or one
    fewer; any other combination makes the slice assignment raise
    ``ValueError``. Neither input list is modified.

    :param tokens: list of items placed at the even positions.
    :param separators: list of items placed at the odd positions.
    :returns: the interleaved list.
    """
    # The concatenation is only used to get a list of the right length;
    # every slot is overwritten by the two slice assignments below.
    merged = tokens + separators
    merged[0::2], merged[1::2] = tokens, separators
    return merged

def _parse_separators(self, fmt, tokens):
    """Return the literal text found between consecutive tokens of ``fmt``.

    ``fmt`` is scanned left to right and each token is looked up starting
    from the end of the previous one. Searching from the start of ``fmt``
    every time would return the wrong position when a token is repeated
    or is a substring of an earlier token (for example ``'YY'`` after
    ``'YYYY'``).

    :param fmt: the format string, e.g. ``'YYYY-MM-DD'``.
    :param tokens: the tokens of ``fmt``, in the order they occur in it.
    :returns: a list of ``len(tokens) - 1`` separator strings; an empty
        list when there are fewer than two tokens.
    """
    separators = []
    cursor = 0  # index in fmt just past the previously located token

    for position, token in enumerate(tokens):
        start = fmt.find(token, cursor)
        if position:
            # Everything between the previous token and this one.
            separators.append(fmt[cursor:start])
        cursor = start + len(token)

    return separators

def _parse_token(self, token, value, parts):

if token == 'YYYY':
Expand Down
7 changes: 7 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,13 @@ Parse from a string:
>>> arrow.get('2013-05-05 12:30:45', 'YYYY-MM-DD HH:mm:ss')
<Arrow [2013-05-05T12:30:45+00:00]>
Search for a date in a string:

.. code-block:: python
>>> arrow.get('June was born in May 1980', 'MMMM YYYY')
<Arrow [1980-05-01T00:00:00+00:00]>
Many ISO-8601 compliant strings are recognized and parsed without a format string:

>>> arrow.get('2013-09-30T15:34:00.000-07:00')
Expand Down
45 changes: 45 additions & 0 deletions tests/parser_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,3 +621,48 @@ def test_french(self):

assertEqual(parser_.parse('Janvier 11e, 2013', 'MMMM Do, YYYY'),
datetime(2013, 1, 11))


class DateTimeParserSearchDateTests(Chai):
    """Check that ``DateTimeParser.parse`` finds a date inside a longer string."""

    def setUp(self):
        super(DateTimeParserSearchDateTests, self).setUp()
        # Parser built without a locale argument; used by most tests below.
        self.parser = parser.DateTimeParser()

    def test_parse_search(self):
        # Text before the date is not part of the format and must be skipped.
        text = 'Today is 25 of September of 2003'
        parsed = self.parser.parse(text, 'DD of MMMM of YYYY')

        assertEqual(parsed, datetime(2003, 9, 25))

    def test_parse_seach_with_numbers(self):
        # A leading number that looks like a year must not be taken as the
        # year when it is not followed by the rest of the format.
        fmt = 'YYYY-MM-DD HH:mm:ss'
        parsed = self.parser.parse('2000 people met the 2012-01-01 12:05:10', fmt)
        assertEqual(parsed, datetime(2012, 1, 1, 12, 5, 10))

        # An earlier date-like group without a time part must be skipped too.
        fmt = 'YY-MM-DD HH:mm:ss'
        parsed = self.parser.parse('Call 01-02-03 on 79-01-01 12:05:10', fmt)
        assertEqual(parsed, datetime(1979, 1, 1, 12, 5, 10))

    def test_parse_seach_with_names(self):
        # 'June' is a month name but is not followed by a year, so the
        # match has to be 'May 1980'.
        parsed = self.parser.parse('June was born in May 1980', 'MMMM YYYY')

        assertEqual(parsed, datetime(1980, 5, 1))

    def test_parse_seach_locale_with_names(self):
        swedish_parser = parser.DateTimeParser('sv_se')

        # Abbreviated month name.
        parsed = swedish_parser.parse('Jan föddes den 31 Dec 1980', 'DD MMM YYYY')
        assertEqual(parsed, datetime(1980, 12, 31))

        # Full month name.
        parsed = swedish_parser.parse('Jag föddes den 25 Augusti 1975', 'DD MMMM YYYY')
        assertEqual(parsed, datetime(1975, 8, 25))

    def test_parse_seach_fails(self):
        # Same Swedish sentence as above, but given to the parser created
        # without the 'sv_se' locale: no match is expected.
        text = 'Jag föddes den 25 Augusti 1975'

        with assertRaises(parser.ParserError):
            self.parser.parse(text, 'DD MMMM YYYY')

0 comments on commit a33a1ef

Please sign in to comment.