forked from Curiosio/wiki_import
-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_wikipedia_test.py
107 lines (89 loc) · 3.65 KB
/
import_wikipedia_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
import unittest
import xml
import re
from import_wikipedia import WikiXmlHandler, extact_general
# Sample MediaWiki XML export (schema version 0.10) used as parser input by
# TestImportWikipedia.test_parse_wikipedia. It contains a <siteinfo> header and
# two <page> elements: the redirect page "AccessibleComputing" and a shortened
# "Anarchism" article with templates and [[Category:...]] links.
DUMP = """<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
<siteinfo>
<sitename>Wikipedia</sitename>
<dbname>enwiki</dbname>
<base>https://en.wikipedia.org/wiki/Main_Page</base>
<generator>MediaWiki 1.28.0-wmf.15</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2" case="first-letter">Media</namespace>
</namespaces>
</siteinfo>
<page>
<title>AccessibleComputing</title>
<ns>0</ns>
<id>10</id>
<redirect title="Computer accessibility" />
<revision>
<id>631144794</id>
<parentid>381202555</parentid>
<timestamp>2014-10-26T04:50:23Z</timestamp>
<contributor>
<username>Paine Ellsworth</username>
<id>9092818</id>
</contributor>
<comment>add [[WP:RCAT|rcat]]s</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text xml:space="preserve">#REDIRECT [[Computer accessibility]]
{{Redr|move|from CamelCase|up}}</text>
<sha1>4ro7vvppa5kmm0o1egfjztzcwd0vabw</sha1>
</revision>
</page>
<page>
<title>Anarchism</title>
<ns>0</ns>
<id>12</id>
<revision>
<id>734566960</id>
<timestamp>2016-08-15T06:01:51Z</timestamp>
<model>wikitext</model>
<format>text/x-wiki</format>
<text xml:space="preserve">{{Redirect2|Anarchist|Anarchists|the fictional character|Anarchist (comics)|other uses|Anarchists (disambiguation)}}
{{Basic forms of government}}
'''Anarchism''' is a [[political philosophy]] that advocates [[self-governance|self-governed]] societies based on voluntary institutions. These are often described
<--This is a *citation* from a book, DON'T CHANGE-->
===First International and the Paris Commune===
{{Main article|International Workingmen's Association|Paris Commune}}
[[File:Bakunin.png|thumb|upright|Collectivist anarchist [[Mikhail Bakunin]] opposed the
[[Category:Anti-fascism]]
[[Category:Ideas of idealists]]
[[Category:Anti-capitalism]]
[[Category:Far-left politics]]</text>
<sha1>az60vahaazg403faw6x2gzpbmiws0o3</sha1>
</revision>
</page>
</mediawiki>"""
# Captures the contents of the first parenthesised group in a SQL statement,
# e.g. the column list of "INSERT INTO t (a, b) VALUES (...)". A raw string is
# used so the backslashes reach the regex engine without triggering Python's
# invalid-escape-sequence warning.
RE_PAR = re.compile(r'\(([^\)]+)\)')


class FakeCursor:
    """Minimal stand-in for a database cursor that records executed inserts.

    Each call to :meth:`execute` is stored in ``results`` as a dict mapping the
    column names found in the SQL text to the supplied parameter values.
    """

    def __init__(self):
        # One dict per execute() call, in call order.
        self.results = []

    def execute(self, sql, params):
        """Record one statement as a ``{column: value}`` dict.

        Args:
            sql: SQL text whose first parenthesised group is a comma-separated
                column list.
            params: Values paired positionally with the column names.

        Raises:
            ValueError: If ``sql`` contains no parenthesised group.
        """
        match = RE_PAR.search(sql)
        if match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'no parenthesised column list in SQL: %r' % (sql,))
        fields = [name.strip() for name in match.group(1).split(',')]
        # zip() stops at the shorter sequence, so surplus columns or params
        # are silently dropped, exactly as before.
        self.results.append(dict(zip(fields, params)))
class TestImportWikipedia(unittest.TestCase):
    """Tests for ``WikiXmlHandler`` and ``extact_general`` from import_wikipedia."""

    def test_parse_wikipedia(self):
        """Feed the sample dump to the SAX handler and check the recorded rows."""
        # ``import xml`` alone does not load the ``xml.sax`` submodule; import
        # it explicitly so the test does not rely on some other module having
        # imported it first.
        import xml.sax

        parser = xml.sax.make_parser()
        fc = FakeCursor()
        parser.setContentHandler(WikiXmlHandler(fc))
        # Feed the document incrementally, one line at a time.
        for line in DUMP.split('\n'):
            parser.feed(line + '\n')

        # One recorded row per <page> element in DUMP.
        self.assertEqual(len(fc.results), 2)
        self.assertEqual(fc.results[0]['title'], 'AccessibleComputing')
        self.assertIn('redr', fc.results[0]['templates'])
        self.assertIn(
            "<--This is a *citation* from a book, DON'T CHANGE-->",
            fc.results[1]['wikitext'])
        self.assertIn('main article', fc.results[1]['templates'])
        self.assertIn('ideas', fc.results[1]['general'])

    def test_extact_general(self):
        """Check ``extact_general`` on a phrase with no match and three with one."""
        self.assertIsNone(extact_general('something something dark'))
        self.assertEqual(
            extact_general('the streets of philadelpha'), 'the streets')
        self.assertEqual(
            extact_general('paintings by dutch potato eaters'), 'paintings')
        self.assertEqual(extact_general('Cities in trouble'), 'Cities')
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()