Skip to content

Commit

Permalink
Merge pull request arrow-py#249 from beenje/fuzzy
Browse files Browse the repository at this point in the history
Search date in strings. Thanks @beenje
  • Loading branch information
andrewelkins committed Aug 7, 2015
2 parents 842495d + a33a1ef commit 8b0a560
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 48 deletions.
76 changes: 28 additions & 48 deletions arrow/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

from datetime import datetime
from dateutil import tz

import calendar
import re

from arrow import locales
Expand Down Expand Up @@ -78,9 +76,9 @@ def parse_iso(self, string):

if has_time:
if space_divider:
date_string, time_string = string.split(' ', 1)
date_string, time_string = string.split(' ', 1)
else:
date_string, time_string = string.split('T', 1)
date_string, time_string = string.split('T', 1)
time_parts = re.split('[+-]', time_string, 1)
has_tz = len(time_parts) > 1
has_seconds = time_parts[0].count(':') > 1
Expand Down Expand Up @@ -114,57 +112,39 @@ def parse(self, string, fmt):
if isinstance(fmt, list):
return self._parse_multiformat(string, fmt)

original_string = string
tokens = self._FORMAT_RE.findall(fmt)
token_values = []
separators = self._parse_separators(fmt, tokens)
parts = {}

for token in tokens:
# fmt is a string of tokens like 'YYYY-MM-DD'
# we construct a new string by replacing each
# token by its pattern:
# 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
fmt_pattern = fmt
tokens = []
offset = 0
for m in self._FORMAT_RE.finditer(fmt):
token = m.group(0)
try:
input_re = self._input_re_map[token]
except KeyError:
raise ParserError('Unrecognized token \'{0}\''.format(token))

match = input_re.search(string)

if match:
token_values.append(match.group(0))
if 'value' in match.groupdict():
self._parse_token(token, match.groupdict()['value'], parts)
else:
self._parse_token(token, match.group(0), parts)
index = match.span(0)[1]
string = string[index:]

input_pattern = '(?P<{0}>{1})'.format(token, input_re.pattern)
tokens.append(token)
# a pattern doesn't have the same length as the token
# it replaces! We keep the difference in the offset variable.
# This works because the string is scanned left-to-right and matches
# are returned in the order found by finditer.
fmt_pattern = fmt_pattern[:m.start() + offset] + input_pattern + fmt_pattern[m.end() + offset:]
offset += len(input_pattern) - (m.end() - m.start())
match = re.search(fmt_pattern, string, flags=re.IGNORECASE)
if match is None:
raise ParserError('Failed to match \'{0}\' when parsing \'{1}\''.format(fmt_pattern, string))
parts = {}
for token in tokens:
if token == 'Do':
value = match.group('value')
else:
raise ParserError('Failed to match token \'{0}\' when parsing \'{1}\''.format(token, original_string))

parsed = ''.join(self._interleave_lists(token_values, separators))
if parsed not in original_string:
raise ParserError('Failed to match format \'{0}\' when parsing \'{1}\''.format(fmt, original_string))

value = match.group(token)
self._parse_token(token, value, parts)
return self._build_datetime(parts)

def _interleave_lists(self, tokens, separators):

joined = tokens + separators
joined[::2] = tokens
joined[1::2] = separators

return joined

def _parse_separators(self, fmt, tokens):

separators = []

for i in range(len(tokens) - 1):
start_index = fmt.find(tokens[i]) + len(tokens[i])
end_index = fmt.find(tokens[i + 1])
separators.append(fmt[start_index:end_index])

return separators

def _parse_token(self, token, value, parts):

if token == 'YYYY':
Expand Down
7 changes: 7 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,13 @@ Parse from a string:
>>> arrow.get('2013-05-05 12:30:45', 'YYYY-MM-DD HH:mm:ss')
<Arrow [2013-05-05T12:30:45+00:00]>
Search for a date in a string:

.. code-block:: python
>>> arrow.get('June was born in May 1980', 'MMMM YYYY')
<Arrow [1980-05-01T00:00:00+00:00]>
Many ISO-8601 compliant strings are recognized and parsed without a format string:

>>> arrow.get('2013-09-30T15:34:00.000-07:00')
Expand Down
45 changes: 45 additions & 0 deletions tests/parser_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,3 +621,48 @@ def test_french(self):

assertEqual(parser_.parse('Janvier 11e, 2013', 'MMMM Do, YYYY'),
datetime(2013, 1, 11))


class DateTimeParserSearchDateTests(Chai):
    """Tests for locating a formatted date inside a longer string."""

    def setUp(self):
        super(DateTimeParserSearchDateTests, self).setUp()
        self.parser = parser.DateTimeParser()

    def test_parse_search(self):
        """A date surrounded by unrelated words is found and parsed."""
        expected = datetime(2003, 9, 25)
        actual = self.parser.parse(
            'Today is 25 of September of 2003', 'DD of MMMM of YYYY')

        assertEqual(actual, expected)

    def test_parse_seach_with_numbers(self):
        """Stray numbers before the date do not get mistaken for it."""
        four_digit_year = self.parser.parse(
            '2000 people met the 2012-01-01 12:05:10', 'YYYY-MM-DD HH:mm:ss')
        assertEqual(four_digit_year, datetime(2012, 1, 1, 12, 5, 10))

        two_digit_year = self.parser.parse(
            'Call 01-02-03 on 79-01-01 12:05:10', 'YY-MM-DD HH:mm:ss')
        assertEqual(two_digit_year, datetime(1979, 1, 1, 12, 5, 10))

    def test_parse_seach_with_names(self):
        """A month name that is not followed by a year is skipped."""
        expected = datetime(1980, 5, 1)
        actual = self.parser.parse('June was born in May 1980', 'MMMM YYYY')

        assertEqual(actual, expected)

    def test_parse_seach_locale_with_names(self):
        """Searching also works with month names of the 'sv_se' locale."""
        p = parser.DateTimeParser('sv_se')

        abbreviated = p.parse('Jan föddes den 31 Dec 1980', 'DD MMM YYYY')
        assertEqual(abbreviated, datetime(1980, 12, 31))

        full_name = p.parse('Jag föddes den 25 Augusti 1975', 'DD MMMM YYYY')
        assertEqual(full_name, datetime(1975, 8, 25))

    def test_parse_seach_fails(self):
        """The default-locale parser rejects a Swedish month name."""
        swedish_text = 'Jag föddes den 25 Augusti 1975'

        with assertRaises(parser.ParserError):
            self.parser.parse(swedish_text, 'DD MMMM YYYY')

0 comments on commit 8b0a560

Please sign in to comment.