Skip to content

Commit

Permalink
Handle nameIdentifiers as dict rather than array
Browse files Browse the repository at this point in the history
This is similar to issue #51, but for another field.
To check for similar errors, a sample containing a single file
for each publisher was created by running
the following.

mkdir sample
cd sample
tar xzvf ../datacite.tar.gz $(awk -F/ '!s[$3] {print "./" $3 "/part_00000.jsonl"; s[$3]++}' <(tar tzvf ../datacite.tar.gz))
tar czvf ../sample.tar.gz .
  • Loading branch information
dspinellis committed Jul 1, 2024
1 parent 346eabf commit f4c28e0
Show file tree
Hide file tree
Showing 4 changed files with 268 additions and 19 deletions.
217 changes: 217 additions & 0 deletions src/alexandria3k/TAGS
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@

common.py,527
class Alexandria3kError(44,1215
def __init__(50,1499
class Alexandria3kInternalError(55,1599
def __init__(60,1856
def is_unittest(65,1956
def warn(74,2198
def ensure_unlinked(86,2467
def query_result(97,2755
def table_exists(112,3112
def ensure_table_exists(129,3568
def set_fast_writing(148,4130
def log_sql(174,5176
def try_sql_execute(186,5488
def program_version(207,6245
def is_url(222,6710
def data_from_uri_provider(232,6868
def get_string_resource(259,7771
def remove_sqlite_comments(271,8106

csv_source.py,359
class VTSource:VTSource32,1103
def __init__(37,1293
def get_container_iterator(42,1466
def get_container_name(46,1621
def Create(50,1771
class CsvCursor:CsvCursor62,2190
def __init__(65,2256
def Eof(78,2645
def Rowid(82,2769
def Column(86,2884
def Filter(99,3283
def Next(113,3910
def Close(125,4297

data_source.py,2267
class StreamingTable:StreamingTable66,1988
def __init__(69,2086
def BestIndex(78,2391
def Disconnect(83,2569
def sample(88,2697
def get_table_meta_by_name(93,2908
def get_table_meta(97,3047
def get_data_source(101,3157
def cursor(105,3272
def Open(117,3858
def get_value_extractor_by_ordinal(121,3970
class StreamingCachedContainerTable(127,4237
def BestIndex(133,4532
class ElementsCursor:ElementsCursor174,6293
def __init__(180,6442
def Filter(192,6865
def Eof(199,7084
def Rowid(204,7232
def record_id(208,7331
def current_row_value(212,7457
def Next(217,7628
def container_id(221,7725
def Column(226,7917
def Close(237,8273
class ItemsCursor:ItemsCursor243,8412
def __init__(247,8542
def Rowid(258,8906
def current_row_value(262,9021
def Eof(266,9145
def Close(270,9269
class FilesCursor(275,9370
def __init__(279,9507
def debug_progress_bar(285,9741
def Filter(304,10390
def Next(318,10989
class _IndexManager:_IndexManager341,11737
def __init__(345,11853
def create_index(350,11996
def drop_indexes(365,12480
class DataSource:DataSource372,12700
def __init__(397,13621
def get_table_meta_by_name(451,15556
def tables_transitive_closure(458,15823
def get_virtual_db(470,16305
def add_column(475,16444
def set_query_columns(483,16715
def trace_query_columns(487,16872
def authorizer(492,17087
def tracer(503,17599
def query(523,18368
def get_query_column_names(604,21796
def populate(609,22026
def set_join_columns(646,23516
def query_and_population_tables(666,24458
def joined_tables(674,24766
def partition_condition(711,26591
def populate_only_root_table(721,26998
def populate_table(745,27819
def add_columns(779,29250
def create_database_schema(799,30069
def create_matched_tables(839,31626
def run_post_population_script(886,33760
class DataFiles:DataFiles943,36138
def __init__(946,36206
def get_file_array(970,37013
def get_container_iterator(974,37120
def get_container_name(978,37281

db_schema.py,780
class TableMeta:TableMeta22,839
def __init__(26,950
def table_schema(50,1884
def insert_statement(60,2410
def get_name(74,2918
def get_primary_key(78,3006
def get_foreign_key(83,3161
def get_extract_multiple(87,3303
def get_parent_extract_multiple(91,3442
def get_post_population_script(95,3617
def get_parent_name(99,3776
def get_cursor_class(103,3914
def get_columns(107,4036
def get_value_extractor_by_ordinal(111,4133
def get_value_extractor_by_name(115,4315
def get_column_definition_by_name(119,4500
class ColumnMeta:ColumnMeta124,4679
def __init__(127,4741
def get_name(134,5019
def get_definition(138,5104
def get_description(147,5519
def get_value_extractor(151,5633

debug.py,109
def set_output(43,1164
def get_output(56,1496
def set_flags(62,1611
def enabled(84,2247
def log(97,2484

file_cache.py,110
class FileCache:FileCache25,884
def __init__(33,1063
def read(37,1156
def get_file_cache(57,1730

file_pubmed_cache.py,110
class FileCache:FileCache25,899
def __init__(33,1077
def read(37,1170
def get_file_cache(55,1656

file_xml_cache.py,110
class FileCache:FileCache24,864
def __init__(30,1012
def read(34,1110
def get_file_cache(61,1857

__main__.py,1099
def module_get_attribute(43,1242
def module_name(49,1466
def class_name(55,1673
def facility_modules(63,2002
def facility_names(71,2330
def get_data_source_instance(78,2551
def populate(102,3570
def add_subcommand_populate(118,4020
def process(173,5623
def add_subcommand_process(182,5922
def add_subcommand_help(202,6578
def top_level_help(205,6684
def query(213,6913
def add_subcommand_query(240,7852
def get_tables(316,10083
def list_facility_schema(322,10287
def add_subcommand_list_complete_schema(336,10762
def list_complete_schema(339,10867
def add_subcommand_list_source_schema(352,11280
def list_source_schema(355,11381
def add_subcommand_list_process_schema(369,11831
def list_process_schema(372,11934
def list_facility_description(386,12384
def add_subcommand_list_processes(409,13314
def list_processes(412,13407
def add_subcommand_list_sources(422,13704
def list_sources(425,13793
def add_subcommand_version(435,14086
def show_version(438,14165
def get_cli_parser(446,14414
def error_raising_main(497,16147
def main(522,16766

perf.py,17
def log(34,1067

tsort.py,19
def tsort(41,1398

uspto_zip_cache.py,117
class UsptoZipCache:UsptoZipCache27,973
def __init__(33,1125
def read(38,1242
def get_zip_cache(84,2902

xml.py,342
def get_element(28,1092
def get_attribute(37,1306
def getter(48,1679
def agetter(54,1843
def all_getter(60,2021
def getter_by_attribute(65,2142
def fgetter(69,2346
def lower(79,2586
def lfunc(83,2705
def get_root_text(90,2827
class XMLCursor(95,2937
def __init__(99,3052
def Next(108,3448
def Rowid(134,4421

__init__.py,0
8 changes: 6 additions & 2 deletions src/alexandria3k/data_sources/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ def current_row_value(self):
# containing a dict. Detect and fix.
for relation in ["creators", "contributors"]:
for creator in self.json_data[relation]:
if isinstance(creator.get("affiliation"), dict):
creator["affiliation"] = [creator["affiliation"]]
affiliation = creator.get("affiliation")
if isinstance(affiliation, dict):
creator["affiliation"] = [affiliation]
name_identifiers = creator.get("nameIdentifiers")
if isinstance(name_identifiers, dict):
creator["nameIdentifiers"] = [name_identifiers]
self.cached_json_item_index = self.item_index
return self.json_data

Expand Down
Binary file modified tests/data/datacite.tar.gz
Binary file not shown.
62 changes: 45 additions & 17 deletions tests/data_sources/test_datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,43 +49,71 @@ def tearDownClass(cls):
cls.con.close()
os.unlink(DATABASE_PATH)

def test_counts(self):
self.assertEqual(self.record_count("dc_works"), 9)
self.assertEqual(self.record_count("dc_work_creators"), 29)
self.assertEqual(self.record_count("dc_creator_name_identifiers"), 8)
self.assertEqual(self.record_count("dc_creator_affiliations"), 17)
self.assertEqual(self.record_count("dc_work_titles"), 9)
self.assertEqual(self.record_count("dc_work_subjects"), 27)
self.assertEqual(self.record_count("dc_work_contributors"), 1)
def test_count_dc_works(self):
self.assertEqual(self.record_count("dc_works"), 10)

def test_count_dc_work_creators(self):
self.assertEqual(self.record_count("dc_work_creators"), 30)

def test_count_dc_creator_name_identifiers(self):
self.assertEqual(self.record_count("dc_creator_name_identifiers"), 9)

def test_count_dc_creator_affiliations(self):
self.assertEqual(self.record_count("dc_creator_affiliations"), 18)

def test_count_dc_work_titles(self):
self.assertEqual(self.record_count("dc_work_titles"), 10)

def test_count_dc_work_subjects(self):
self.assertEqual(self.record_count("dc_work_subjects"), 31)

def test_count_dc_work_contributors(self):
self.assertEqual(self.record_count("dc_work_contributors"), 2)

def test_count_dc_contributor_name_identifiers(self):
self.assertEqual(self.record_count("dc_contributor_name_identifiers"), 0)
self.assertEqual(self.record_count("dc_contributor_affiliations"), 1)
self.assertEqual(self.record_count("dc_work_dates"), 7)

def test_count_dc_contributor_affiliations(self):
self.assertEqual(self.record_count("dc_contributor_affiliations"), 2)

def test_count_dc_work_dates(self):
self.assertEqual(self.record_count("dc_work_dates"), 8)

def test_count_dc_work_related_identifiers(self):
self.assertEqual(self.record_count("dc_work_related_identifiers"), 20)
self.assertEqual(self.record_count("dc_work_descriptions"), 12)

def test_count_dc_work_descriptions(self):
self.assertEqual(self.record_count("dc_work_descriptions"), 13)

def test_count_dc_work_geo_locations(self):
self.assertEqual(self.record_count("dc_work_geo_locations"), 2)
self.assertEqual(self.record_count("dc_work_funding_references"), 2)

def test_count_dc_work_funding_references(self):
self.assertEqual(self.record_count("dc_work_funding_references"), 2)

def test_count_distinct_name_identifiers(self):
self.assertEqual(
self.record_count(
"""(SELECT DISTINCT name_identifier
FROM dc_creator_name_identifiers)"""
),
8,
9,
)

def test_count_distinct_work_creators(self):
self.assertEqual(
self.record_count(
"""(SELECT DISTINCT work_id
FROM dc_work_creators)"""
),
9,
10,
)

def test_count_distinct_works(self):
self.assertEqual(self.record_count(
"""(SELECT DISTINCT container_id FROM dc_works)"""
),
4,
5,
)

def test_work_contents(self):
Expand Down Expand Up @@ -229,7 +257,7 @@ def tearDownClass(cls):
os.unlink(DATABASE_PATH)

def test_counts(self):
self.assertEqual(self.record_count("dc_works"), 9)
self.assertEqual(self.record_count("dc_works"), 10)

def test_no_extra_fields(self):
with self.assertRaises(sqlite3.OperationalError):
Expand Down Expand Up @@ -262,7 +290,7 @@ def tearDownClass(cls):
os.unlink(DATABASE_PATH)

def test_counts(self):
self.assertEqual(self.record_count("dc_works"), 6)
self.assertEqual(self.record_count("dc_works"), 7)

def test_no_extra_fields(self):
with self.assertRaises(sqlite3.OperationalError):
Expand Down

0 comments on commit f4c28e0

Please sign in to comment.