Merge pull request arrow-py#249 from beenje/fuzzy

Search for dates in strings. Thanks @beenje
rene-armida · Aug 7, 2015 · 8b0a560
2 parents 842495d + a33a1ef
commit 8b0a560
Show file tree

Hide file tree

Showing 3 changed files with 80 additions and 48 deletions.
diff --git a/arrow/parser.py b/arrow/parser.py
@@ -4,8 +4,6 @@
 
 from datetime import datetime
 from dateutil import tz
-
-import calendar
 import re
 
 from arrow import locales
@@ -78,9 +76,9 @@ def parse_iso(self, string):
 
         if has_time:
             if space_divider:
-               date_string, time_string = string.split(' ', 1)
+                date_string, time_string = string.split(' ', 1)
             else:
-               date_string, time_string = string.split('T', 1)
+                date_string, time_string = string.split('T', 1)
             time_parts = re.split('[+-]', time_string, 1)
             has_tz = len(time_parts) > 1
             has_seconds = time_parts[0].count(':') > 1
@@ -114,57 +112,39 @@ def parse(self, string, fmt):
         if isinstance(fmt, list):
             return self._parse_multiformat(string, fmt)
 
-        original_string = string
-        tokens = self._FORMAT_RE.findall(fmt)
-        token_values = []
-        separators = self._parse_separators(fmt, tokens)
-        parts = {}
-
-        for token in tokens:
+        # fmt is a string of tokens like 'YYYY-MM-DD'
+        # we construct a new string by replacing each
+        # token by its pattern:
+        # 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
+        fmt_pattern = fmt
+        tokens = []
+        offset = 0
+        for m in self._FORMAT_RE.finditer(fmt):
+            token = m.group(0)
             try:
                 input_re = self._input_re_map[token]
             except KeyError:
                 raise ParserError('Unrecognized token \'{0}\''.format(token))
-
-            match = input_re.search(string)
-
-            if match:
-                token_values.append(match.group(0))
-                if 'value' in match.groupdict():
-                    self._parse_token(token, match.groupdict()['value'], parts)
-                else:
-                    self._parse_token(token, match.group(0), parts)
-                index = match.span(0)[1]
-                string = string[index:]
-
+            input_pattern = '(?P<{0}>{1})'.format(token, input_re.pattern)
+            tokens.append(token)
+            # a pattern doesn't have the same length as the token
+            # it replaces! We keep the difference in the offset variable.
+            # This works because the string is scanned left-to-right and matches
+            # are returned in the order found by finditer.
+            fmt_pattern = fmt_pattern[:m.start() + offset] + input_pattern + fmt_pattern[m.end() + offset:]
+            offset += len(input_pattern) - (m.end() - m.start())
+        match = re.search(fmt_pattern, string, flags=re.IGNORECASE)
+        if match is None:
+            raise ParserError('Failed to match \'{0}\' when parsing \'{1}\''.format(fmt_pattern, string))
+        parts = {}
+        for token in tokens:
+            if token == 'Do':
+                value = match.group('value')
             else:
-                raise ParserError('Failed to match token \'{0}\' when parsing \'{1}\''.format(token, original_string))
-
-        parsed = ''.join(self._interleave_lists(token_values, separators))
-        if parsed not in original_string:
-            raise ParserError('Failed to match format \'{0}\' when parsing \'{1}\''.format(fmt, original_string))
-
+                value = match.group(token)
+            self._parse_token(token, value, parts)
         return self._build_datetime(parts)
 
-    def _interleave_lists(self, tokens, separators):
-
-        joined = tokens + separators
-        joined[::2] = tokens
-        joined[1::2] = separators
-
-        return joined
-
-    def _parse_separators(self, fmt, tokens):
-
-        separators = []
-
-        for i in range(len(tokens) - 1):
-            start_index = fmt.find(tokens[i]) + len(tokens[i])
-            end_index = fmt.find(tokens[i + 1])
-            separators.append(fmt[start_index:end_index])
-
-        return separators
-
     def _parse_token(self, token, value, parts):
 
         if token == 'YYYY':

diff --git a/docs/index.rst b/docs/index.rst
@@ -138,6 +138,13 @@ Parse from a string:
     >>> arrow.get('2013-05-05 12:30:45', 'YYYY-MM-DD HH:mm:ss')
     <Arrow [2013-05-05T12:30:45+00:00]>
 
+Search for a date in a string:
+
+.. code-block:: python
+
+    >>> arrow.get('June was born in May 1980', 'MMMM YYYY')
+    <Arrow [1980-05-01T00:00:00+00:00]>
+
 Many ISO-8601 compliant strings are recognized and parsed without a format string:
 
     >>> arrow.get('2013-09-30T15:34:00.000-07:00')

diff --git a/tests/parser_tests.py b/tests/parser_tests.py
@@ -621,3 +621,48 @@ def test_french(self):
 
         assertEqual(parser_.parse('Janvier 11e, 2013', 'MMMM Do, YYYY'),
                     datetime(2013, 1, 11))
+
+
+class DateTimeParserSearchDateTests(Chai):
+
+    def setUp(self):
+        super(DateTimeParserSearchDateTests, self).setUp()
+        self.parser = parser.DateTimeParser()
+
+    def test_parse_search(self):
+
+        assertEqual(
+            self.parser.parse('Today is 25 of September of 2003', 'DD of MMMM of YYYY'),
+            datetime(2003, 9, 25))
+
+    def test_parse_seach_with_numbers(self):
+
+        assertEqual(
+            self.parser.parse('2000 people met the 2012-01-01 12:05:10', 'YYYY-MM-DD HH:mm:ss'),
+            datetime(2012, 1, 1, 12, 5, 10))
+
+        assertEqual(
+            self.parser.parse('Call 01-02-03 on 79-01-01 12:05:10', 'YY-MM-DD HH:mm:ss'),
+            datetime(1979, 1, 1, 12, 5, 10))
+
+    def test_parse_seach_with_names(self):
+
+        assertEqual(
+            self.parser.parse('June was born in May 1980', 'MMMM YYYY'),
+            datetime(1980, 5, 1))
+
+    def test_parse_seach_locale_with_names(self):
+        p = parser.DateTimeParser('sv_se')
+
+        assertEqual(
+            p.parse('Jan föddes den 31 Dec 1980', 'DD MMM YYYY'),
+            datetime(1980, 12, 31))
+
+        assertEqual(
+            p.parse('Jag föddes den 25 Augusti 1975', 'DD MMMM YYYY'),
+            datetime(1975, 8, 25))
+
+    def test_parse_seach_fails(self):
+
+        with assertRaises(parser.ParserError):
+            self.parser.parse('Jag föddes den 25 Augusti 1975', 'DD MMMM YYYY')