diff --git a/docs/_sources/feature_selection_transforms.md.txt b/docs/_sources/feature_selection_transforms.md.txt index 3f89ee1..0cf35b0 100644 --- a/docs/_sources/feature_selection_transforms.md.txt +++ b/docs/_sources/feature_selection_transforms.md.txt @@ -21,6 +21,9 @@ few utility attributes which are available for all transforms: - `checkpoint` - Type: `boolean`. Optional. If set to true, checkpoint the dataset in Spark before computing the feature selection. This can reduce some resource usage for very complex workflows, but should not be necessary. +- `skip` - Type: `boolean`. Optional. If set to true, don't compute this + feature selection. This has the same effect as commenting the feature + selection out of your config file. ## bigrams diff --git a/docs/feature_selection_transforms.html b/docs/feature_selection_transforms.html index ee95d70..8c80e34 100644 --- a/docs/feature_selection_transforms.html +++ b/docs/feature_selection_transforms.html @@ -55,6 +55,9 @@

Feature Selection Transformscheckpoint - Type: boolean. Optional. If set to true, checkpoint the dataset in Spark before computing the feature selection. This can reduce some resource usage for very complex workflows, but should not be necessary.

+
  • skip - Type: boolean. Optional. If set to true, don’t compute this +feature selection. This has the same effect as commenting the feature +selection out of your config file.

  • bigrams

    diff --git a/docs/searchindex.js b/docs/searchindex.js index c3a4121..02d7cf8 100644 --- a/docs/searchindex.js +++ b/docs/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["column_mappings", "comparison_types", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "filenames": ["column_mappings.md", "comparison_types.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "titles": ["Column Mappings", "Comparison types, transform add-ons, aggregate features, and household aggregate features", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "terms": {"each": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], "read": [0, 1, 2, 7, 10], "from": [0, 1, 2, 4, 6, 7, 8, 9, 10, 12], "input": [0, 1, 2, 3, 6, 7, 10, 11], "dataset": [0, 1, 2, 3, 6, 7, 10, 12], "hlink": [0, 1, 2, 3, 5, 6, 7, 12], "It": [0, 1, 2, 6, 10, 12], "ha": [0, 1, 2, 6, 10, 12], "column_nam": [0, 1, 2, 11], "attribut": [0, 1, 2, 3, 7, 8, 9, 10, 11], "which": [0, 1, 2, 3, 6, 7, 9, 10, 12], "specifi": [0, 1, 2, 6, 7, 9, 10, 11], "name": [0, 1, 2, 3, 10, 11], "both": [0, 1, 2, 7, 12], "option": [0, 1, 2, 3, 6, 7, 8, 10, 12], "mai": [0, 2, 6, 7, 10], "have": [0, 1, 2, 3, 5, 6, 7, 8, 10, 12], "an": [0, 1, 2, 6, 8, 10], "alia": [0, 2, 7], "give": [0, 2], "new": [0, 2, 12], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "support": [0, 7, 8], "some": [0, 1, 2, 3, 6, 7, 10], "make": [0, 1, 2, 5, 12], "chang": [0, 1, 2, 5, 10, 12], "data": [0, 1, 4, 6, 7, 10], "thei": [0, 1, 2, 7, 10], "ar": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12], "These": [0, 1, 2, 6, 7, 8, 9], "clean": [0, 6], "harmon": 0, "The": [0, 1, 2, 3, 5, 7, 8, 9, 10, 12], "avail": [0, 1, 2, 3, 5, 7, 8, 9, 12], "list": [0, 1, 2, 3, 4, 8, 10, 11], "below": [0, 1, 2, 3, 8, 9, 10], "section": [0, 1, 2, 12], "By": [0, 2, 10], "default": [0, 1, 2, 7, 8, 10], "must": [0, 1, 2, 3, 8, 9, 11], "same": [0, 1, 2, 6, 7, 10], "With": [0, 9], "override_column_a": [0, 2, 3], "override_column_b": [0, 2, 3], "you": [0, 1, 2, 5, 10, 11, 12], "can": [0, 1, 2, 3, 5, 6, 7, 8, 10, 12], "differ": [0, 1, 2, 4, 6, 7], "either": [0, 1, 2, 6, 11], "A": [0, 1, 2, 3, 9, 10], "b": [0, 1, 2, 3, 10], "when": [0, 1, 2, 3, 7, 12], "do": [0, 1, 3, 10, 12], "thi": [0, 1, 2, 3, 5, 6, 7, 9, 10, 12], "appli": [0, 2, 3, 7, 12], "onli": [0, 1, 2, 7, 12], "non": 0, "overrid": [0, 2], "also": [0, 1, 2, 5, 6, 7, 9, 10, 12], "provid": [0, 2, 3, 6, 7, 9, 10], "override_transform": [0, 2], "describ": [0, 2, 10], "type": [0, 2, 3, 4, 7, 8, 10, 11, 12], "oper": [0, 2], "singl": [0, 2, 3, 10, 12], "output": [0, 1, 2, 3, 6, 7, 10, 12], "more": [0, 1, 2, 9, 10, 12], "than": [0, 1, 2, 8], "one": [0, 1, 2, 7], "order": [0, 2, 7], "so": [0, 1, 2, 5, 12], "anoth": [0, 1, 3, 7], "format": 0, "letter": 0, "t": [0, 1, 2, 3, 12], "u": 0, "repres": [0, 1, 2, 9, 10], "arbitrari": 0, "requir": [0, 1, 2, 3, 4, 7, 9, 10, 11], "addit": [0, 1, 2, 5, 6, 10], "vari": [0, 2, 3], "inform": [0, 1, 2, 10], "appear": [0, 1], "its": [0, 1, 6, 10], "suffix": 0, "mean": [0, 2], "two": [0, 1, 2, 3, 6, 7, 9, 10, 12], "link": [0, 1, 2, 4, 6, 8], "most": [0, 1, 7, 10], "independ": [0, 2], "For": [0, 1, 2, 7, 10, 12], "exampl": [0, 1, 2], "taken": [0, 1], "10": [0, 2, 5, 12], "year": [0, 1, 2, 3, 4], "apart": 0, "want": [0, 1, 2, 10, 12], "standard": [0, 1, 11], "ag": [0, 1, 2, 3], "variabl": [0, 1, 2, 12], "i": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12], "compar": [0, 1, 2, 6, 7], "between": [0, 1, 2, 6, 7, 10, 12], "To": [0, 1, 5, 7, 10], "could": [0, 2], "creat": [0, 2, 6, 7, 9, 10, 11, 12], "age_at_dataset_b": 0, "ad": [0, 1, 2], "column_map": [0, 2, 7], "valu": [0, 1, 2, 3, 8, 9, 10, 11], "As": 0, "suppos": [0, 2], "record": [0, 1, 2, 6, 7], "person": [0, 1, 6], "": [0, 1, 2, 6, 7, 10, 11], "first": [0, 1, 2, 5, 7, 10, 11], "string": [0, 1, 2, 3, 7, 8, 10, 11], "In": [0, 1, 6, 10, 12], "call": 0, "namefrst": [0, 1, 2], "entir": [0, 2], "lowercas": 0, "first_nam": 0, "uppercas": 0, "follow": [0, 1, 6, 10, 11, 12], "configur": [0, 1, 6, 10, 12], "add": [0, 4], "_": [0, 1, 2, 3, 4, 8, 9, 10], "given": [0, 1, 2, 3, 8, 12], "numer": [0, 1], "11": [0, 2, 5, 9], "concat": 0, "concaten": [0, 1], "end": [0, 1, 2, 3, 11], "col": [0, 1], "togeth": [0, 1, 2], "take": [0, 1, 2, 3, 7, 10], "column_to_append": 0, "multipl": [0, 1, 2, 10], "time": [0, 2, 7, 10], "row": [0, 2, 3], "If": [0, 1, 2, 3, 7, 8, 10, 11], "automat": [0, 2, 5, 7], "convert": [0, 1, 2], "befor": [0, 1, 2, 3, 5, 7], "statefip": [0, 1, 2], "counti": [0, 1], "strip": [0, 7], "alphabet": 0, "charact": 0, "lower": [0, 1], "case": [0, 1, 2, 3, 6], "white": 0, "space": [0, 2, 3, 11], "start": [0, 11], "ration": 0, "word": [0, 4], "replac": [0, 1, 4], "sinc": [0, 2], "peopl": [0, 1, 6, 10], "raw": [0, 2, 7, 10], "censu": [0, 7, 12], "contain": [0, 1, 11], "lead": 0, "better": [0, 6], "match": [0, 1, 4, 6, 10, 11, 12], "remov": 0, "qmark": 0, "hyphen": 0, "punctuat": 0, "apostroph": 0, "altern": [0, 2], "surround": 0, "all": [0, 1, 2, 3, 7, 8, 9, 10], "them": [0, 1, 2, 7], "jr": [0, 2], "sr": [0, 2], "ii": [0, 2], "iii": [0, 2], "stop": 0, "last": [0, 1, 7, 9], "street": [0, 1], "avenu": [0, 11], "blvd": 0, "circl": 0, "court": 0, "road": 0, "prefix": 0, "like": [0, 2, 7, 10], "m": [0, 1], "mr": 0, "ah": 0, "chines": 0, "condens": 0, "whitespac": [0, 7], "leav": 0, "behind": 0, "arrai": [0, 2, 4, 8, 9], "namefrst_split": [0, 2], "namefrst_clean": [0, 2], "index": [0, 5], "select": [0, 1, 4, 6, 10, 12], "element": 0, "posit": [0, 1, 2, 4, 6], "second": [0, 1, 2, 11], "1": [0, 1, 2, 4, 7, 8, 9, 10, 12], "item": 0, "set": [0, 1, 2, 3, 6, 7, 10, 12], "Then": [0, 5], "0": [0, 1, 2, 7, 8, 9, 10, 12], "initi": [0, 1, 10], "probabl": [0, 2, 8], "middl": [0, 1], "namefrst_mid_init": [0, 1], "otherwis": [0, 1, 9, 12], "known": 0, "recod": 0, "birthyr": [0, 2], "clean_birthyr": [0, 2, 3], "9999": [0, 2, 9], "1999": [0, 2], "9998": 0, "divid": 0, "int": [0, 1, 2, 3, 8], "integ": [0, 1, 2, 9], "result": [0, 1, 6, 9, 10, 12], "instanc": [0, 8], "birthplac": [0, 2], "detail": [0, 2, 10], "version": [0, 5, 12], "gener": [0, 1, 4, 6, 7, 10], "least": [0, 1], "signific": 0, "digit": 0, "we": [0, 1, 10, 12], "simpli": [0, 2], "drop": [0, 2, 10], "100": [0, 2, 12], "round": [0, 2], "lowest": 0, "whole": [0, 6], "number": [0, 1, 2, 3, 7, 8, 10], "floor": 0, "function": [0, 1, 2, 6, 10], "bpl": [0, 1, 2], "bpl_root": 0, "condit": [0, 1, 2, 3, 4, 7], "logic": 0, "work": [0, 1, 2, 5, 7, 10, 12], "sql": [0, 1, 2, 3, 4, 7, 10], "express": [0, 1, 2], "claus": [0, 1], "if_valu": 0, "else_valu": 0, "race": [0, 1, 2, 9, 12], "ipum": [0, 6], "code": [0, 1, 2, 5], "categori": [0, 8], "get": [0, 1, 2, 10], "down": [0, 6, 12], "nearest": 0, "produc": [0, 10], "relat": [0, 1, 2], "hundr": 0, "300": 0, "child": [0, 8], "household": [0, 4, 6, 8, 10, 12], "head": 0, "301": 0, "302": 0, "adopt": 0, "303": 0, "step": [0, 1, 2, 6], "usual": [0, 7, 12], "need": [0, 1, 2, 7, 10, 12], "2": [0, 1, 2, 3, 7, 8, 11, 12], "spous": 0, "3": [0, 1, 2, 5, 7, 8, 9, 12], "4": [0, 1, 8], "law": 0, "5": [0, 1, 2, 8, 9, 10, 12], "parent": [0, 1, 11], "6": [0, 2, 8, 9, 12], "7": [0, 1, 2, 8, 12], "sibl": 0, "12": [0, 5], "relate_div_100": [0, 1, 2], "page": [1, 2, 10], "comparison_featur": [1, 2, 7], "along": 1, "header": [1, 2, 9, 11], "context": [1, 9], "relatematch": [1, 2], "comparison_typ": [1, 2], "categor": [1, 2, 8, 9], "true": [1, 2, 3, 7, 9, 11, 12], "maximum": [1, 8], "jaro": [1, 9], "winkler": [1, 9], "find": [1, 7, 12], "greatest": 1, "among": 1, "cartesian": 1, "product": [1, 6, 12], "column": [1, 3, 4, 7, 9, 10, 11, 12], "namelast": [1, 2], "would": [1, 2, 12], "return": [1, 3, 8, 10], "four": 1, "namefrst_a": 1, "namefrst_b": 1, "namelast_b": 1, "namelast_a": 1, "maximum_jw": 1, "score": [1, 2, 7, 9], "namefrst_jw": [1, 2, 12], "geograph": 1, "filter": [1, 4, 7, 11], "major": [1, 10], "locat": [1, 2, 10], "boundari": 1, "zero": 1, "jw_street": 1, "enum_dist": 1, "max": [1, 8, 10], "member": [1, 7], "neighborhood": 1, "surnam": 1, "related_individual_max_jw": 1, "namefrst_rel": 1, "assert": [1, 10], "NOT": 1, "distinct": 1, "f1": 1, "evalu": [1, 2, 6, 7, 8], "ani": [1, 2, 3, 5, 8], "potenti": [1, 4, 7], "mismatch": 1, "queri": [1, 2], "fi": 1, "OR": [1, 2], "mi0": 1, "mi1": 1, "THEN": 1, "els": [1, 2, 3], "first_init_col": 1, "namefrst_init": 1, "mid_init_col": 1, "namefrst_mid_init_2": 1, "f2": 1, "empti": 1, "null": [1, 2, 3], "AND": [1, 2], "individu": [1, 2, 7, 12], "mainli": 1, "caution": [1, 9], "flag": [1, 9, 10, 12], "f": [1, 10], "sp": 1, "m_caution": [1, 2, 9, 12], "mbpl": 1, "mother_birthyr": 1, "stepmom": 1, "momloc": 1, "comp_a": [1, 2], "comp_b": [1, 2], "comp_c": 1, "parent_step_chang": 1, "comp_d": 1, "check": [1, 10], "sign": 1, "boolean": [1, 2, 3, 11, 12], "form": [1, 7, 11], "cast": 1, "namelast_equal_as_int": 1, "namelast_clean": [1, 2, 3], "whether": [1, 2, 11], "join": [1, 2, 11], "across": 1, "being": [1, 7], "exact": [1, 2], "namefrst_unstd": [1, 2], "present": [1, 2, 9], "nonzero": 1, "primarili": [1, 7], "indic": [1, 12], "kind": 1, "incompar": 1, "akin": 1, "miss": [1, 10], "see": [1, 2, 5, 10, 12], "univers": [1, 4, 7], "similar": 1, "fbpl_nomatch": 1, "fbpl": 1, "allow": [1, 2, 7, 12], "up": [1, 2, 10, 11], "sub": 1, "object": [1, 2, 6, 10], "document": [1, 8, 10, 12], "sp_caution": [1, 2, 12], "spouse_bpl": 1, "spouse_birthyr": 1, "durmarr": [1, 2], "new_marr": [1, 2], "street_jw": [1, 2, 12], "9": 1, "multipli": 1, "after": [1, 2, 4, 8, 10], "float": [1, 2, 8], "comp": 1, "c": 1, "sploc": 1, "012": 1, "fals": [1, 2, 3, 4, 6, 10], "d": 1, "under": [1, 2], "specif": [1, 2, 10], "circumst": 1, "should": [1, 2, 3, 8, 9, 10], "mid_init_match": 1, "either_1": 1, "nativ": 1, "either_0": 1, "gen": 1, "imm": [1, 2, 12], "immigr": 1, "look": [1, 10, 11], "foreign": 1, "born": 1, "sgen": [1, 2, 12], "rel": [1, 2, 12], "scala": 1, "determin": [1, 7], "greater": [1, 5], "jw_threshold": 1, "less": [1, 2], "age_threshold": 1, "sex": [1, 2, 11], "sampl": 1, "related_individual_row": 1, "unrel": 1, "depend": [1, 2, 5, 12], "name_col": 1, "birthyr_col": 1, "namefrst_related_row": 1, "replaced_birthyr": [1, 2, 3], "extra": 1, "children": 1, "who": 1, "base": [1, 2, 7], "expect": 1, "count": [1, 10, 12], "suspect": [1, 6], "relate_col": 1, "histid_col": 1, "id": [1, 2], "birth": 1, "year_b": 1, "wa": [1, 12], "minimum": [1, 8], "accept": [1, 2, 12], "consid": [1, 8], "histid": [1, 2, 12], "1910": [1, 2, 12], "8": [1, 2, 5, 10], "rate": 1, "calcul": [1, 12], "percentag": 1, "seen": 1, "neighbor": 1, "meet": 1, "95": 1, "nbor": [1, 2, 12], "namelast_neighbor": 1, "05": [1, 2], "namelast_popularity_sum": 1, "namelast_popular": 1, "length": [1, 2, 9], "size": 1, "ab": 1, "diff": 1, "absolut": 1, "invalid": [1, 8], "instead": [1, 2, 3, 5, 7], "marriag": 1, "durat": 1, "99": [1, 2], "placehold": 1, "unknown": 1, "exclud": 1, "those": [1, 2], "consider": 1, "byrdiff": [1, 2, 12], "mardurmatch": [1, 2], "14": 1, "minu": [1, 2], "subtract": 1, "geo": 1, "distanc": [1, 8], "lookup": 1, "tabl": [1, 2, 4, 7, 10, 12], "core": [1, 7, 10, 12], "dist_tabl": 1, "py": [1, 2], "There": [1, 2, 7], "sever": [1, 6], "wai": [1, 5, 10], "file": [1, 4, 6, 7, 10, 11, 12], "kei": [1, 7, 10], "key_count": 1, "secondari": 1, "serv": 1, "back": 1, "primari": [1, 6], "doe": [1, 3, 7, 12], "particularli": 1, "state": [1, 6], "much": [1, 7], "fewer": [1, 8], "combin": [1, 2, 3, 7], "thu": 1, "risk": 1, "fill": 1, "aren": 1, "ex": 1, "just": [1, 2, 10, 12], "even": 1, "though": 1, "distances_fil": 1, "path": [1, 2, 10, 11, 12], "table_nam": 1, "what": [1, 2, 10, 12], "onc": [1, 10], "loc_a": 1, "where": [1, 7, 10, 12], "come": 1, "loc_b": 1, "distance_col": 1, "source_column_a": 1, "sourc": [1, 4, 7, 10, 12], "source_column_b": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b_0": 1, "loc_b_1": 1, "secondary_key_count": 1, "backup": 1, "secondary_table_nam": 1, "secondary_distances_fil": 1, "secondary_source_column": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_distance_col": 1, "state_dist": 1, "state_distance_lookup": 1, "county_state_dist": 1, "csv": [1, 2, 7, 10, 11, 12], "statecode1": 1, "statecode2": 1, "dist": 1, "county_dist": [1, 2, 12], "county_distance_lookup": 1, "county_1900_1910_distances_km": 1, "from_icpsrctyi": 1, "to_icpsrctyi": 1, "from_statefip": 1, "to_statefip": 1, "distance_km": 1, "state_1900_1910_distances_km": 1, "fetch": 1, "neither": 1, "nor": 1, "mpre": 1, "m_namefrst": 1, "accord": 1, "niu": 1, "other": [1, 2, 12], "mfbplmatch": 1, "multi": 1, "search": 1, "special": 1, "simplifi": 1, "particular": [1, 2], "constraint": 1, "num_col": 1, "whose": 1, "templat": 1, "n": [1, 8, 9], "per": [1, 2, 8, 9, 10], "current": [1, 2, 10], "respect": [1, 7], "jw_col_templ": 1, "jw": 1, "pair": [1, 12], "equal_and_not_null_templ": 1, "final": [1, 2, 12], "comput": [1, 3, 7], "_namefrst": 1, "_bpl": 1, "_sex": 1, "25": 1, "nvl": 1, "sm_namefrst": 1, "sn_namefrst": 1, "sm_bpl": 1, "sn_bpl": 1, "sm_sex": 1, "sn_sex": 1, "pass": [1, 2, 7, 8], "flexibl": 1, "user": [1, 10], "write": [1, 10, 12], "own": [1, 2], "favor": 1, "reason": 1, "good": 1, "fallback": 1, "defin": [1, 7, 8, 9, 10], "spark": [1, 2, 3, 5, 8, 9, 10, 12], "builtin": 1, "argument": [1, 10, 12], "namelast_jw_max": 1, "namelast1": 1, "namelast2": 1, "namelast3": 1, "abov": [1, 5], "extend": 1, "beyond": 1, "top": [1, 4], "level": [1, 4, 10], "everi": [1, 3], "jw_f": [1, 2, 12], "father_namefrst": 1, "rais": [1, 3], "exponenti": 1, "squar": 1, "county_distance_squar": [1, 2, 12], "county_a": 1, "county_b": 1, "upper": 1, "gt": 1, "btwn": 1, "addl": 1, "var": [1, 2], "program": [1, 2, 7, 12], "report": [1, 4, 6, 10], "addl_var": 1, "check_val_expr": 1, "else_v": 1, "volumn": 1, "datasourc": [1, 2, 10], "yrimmig": 1, "immyear_diff": [1, 2, 9, 12], "includ": [1, 2, 7, 9, 10], "train": [1, 4, 6, 8, 10], "independent_var": [1, 2, 12], "config": [1, 4, 7, 10, 12], "id_column": [1, 2], "_a": 1, "mult": 1, "exist": [1, 2, 10], "within": [1, 2, 6, 10, 11], "hh_train": [1, 2, 7, 10, 12], "hh": 1, "highest": [1, 2], "against": [1, 11], "ten": [1, 2], "tell": [2, 3], "how": [2, 7], "descript": [2, 8, 10], "refer": 2, "here": [2, 7, 10, 12], "tutori": [2, 10], "script": [2, 6, 10], "discuss": 2, "readm": 2, "note": 2, "written": [2, 6], "toml": [2, 6, 10], "abl": 2, "json": [2, 10], "datasource_a": [2, 7], "datasource_b": [2, 7], "transform": [2, 4, 6, 7], "lowercase_strip": 2, "add_to_a": 2, "age_2": 2, "derived_from": 2, "expand_length": 2, "explod": [2, 7], "jaro_winkl": 2, "namelast_jw": [2, 12], "threshold": [2, 8, 12], "feature_nam": 2, "79": 2, "84": 2, "complex": [2, 3], "machin": [2, 6, 7, 10, 12], "learn": [2, 6, 7, 10, 12], "probabilist": [2, 6], "drop_data_from_scored_match": 2, "us1900": 2, "us1900m_usa": 2, "p": 2, "parquet": [2, 7], "us1910": 2, "us1910m_usa": 2, "training_data_subset": 2, "serialp": 2, "rationalize_name_word": 2, "remove_qmark_hyphen": 2, "replace_apostroph": 2, "remove_suffix": 2, "remove_alternate_nam": 2, "condense_strip_whitespac": 2, "split": [2, 3, 7, 8, 9, 12], "namefrst_std": [2, 11], "array_index": 2, "bpl_orig": 2, "divide_by_int": 2, "get_floor": 2, "statefip_h": 2, "output_typ": 2, "substitution_column": [2, 7, 11], "join_column": [2, 11], "join_valu": [2, 11], "substitution_fil": [2, 11], "name_std": [2, 11], "male": [2, 11], "femal": [2, 11], "feature_select": [2, 3, 7], "input_column": [2, 3, 9], "output_column": [2, 3, 9], "sql_condit": 2, "namelast_bigram": 2, "bigram": [2, 4], "bpl_clean": 2, "bpl_str": 2, "washington": 2, "bpl2_str": 2, "53": 2, "region": [2, 12], "attach_vari": 2, "region_dict": 2, "col_to_join_on": 2, "col_to_add": 2, "null_fil": 2, "col_typ": 2, "potential_matches_univers": [2, 7], "birthyr_3": 2, "namefrst_std_jw": [2, 12], "75": [2, 8, 12], "comparis": 2, "post": [2, 7], "hh_comparison": [2, 7], "threshold_expr": 2, "fetch_a": 2, "sex_equ": 2, "equal": [2, 11], "relate_a": [2, 9], "pipeline_featur": [2, 7, 9], "sex_region_interact": 2, "transformer_typ": [2, 9], "interact": [2, 4, 7, 12], "relatetyp": [2, 9], "bucket": [2, 7], "hit": [2, 10, 12], "scale_data": [2, 12], "training_data": [2, 10], "dependent_var": [2, 12], "score_with_model": [2, 12], "use_training_data_featur": [2, 7, 12], "split_by_id_a": [2, 12], "decis": [2, 4, 8, 12], "drop_duplicate_with_threshold_ratio": [2, 12], "n_training_iter": [2, 7, 12], "output_suspicious_td": [2, 12], "param_grid": [2, 12], "model_paramet": [2, 7, 8, 12], "random_forest": [2, 12], "maxdepth": [2, 8, 12], "numtre": [2, 8, 12], "005": 2, "threshold_ratio": [2, 8, 12], "logistic_regress": [2, 12], "50": [2, 12], "65": 2, "80": 2, "chosen_model": [2, 8, 12], "prediction_col": 2, "predict": [2, 12], "hh_col": 2, "hh_training_data_1900_1910": 2, "probit": [2, 4], "go": [2, 10], "your": [2, 5, 7, 10, 12], "uniqu": 2, "identifi": [2, 6, 12], "full": [2, 7, 12], "short": 2, "alphanumer": 2, "convert_ints_to_long": 2, "long": [2, 11], "especi": 2, "assum": 2, "schema": 2, "sometim": 2, "term": 2, "bigint": 2, "thing": 2, "my_fil": 2, "subset": [2, 11], "limit": 2, "extract": 2, "modifi": 2, "meant": 2, "usag": [2, 3, 4, 10], "set_value_column_a": [2, 3], "liter": 2, "set_value_column_b": [2, 3], "iv": 2, "v": 2, "vi": 2, "vii": 2, "viii": 2, "namelast_clean_bigram": [2, 3], "fed": [2, 7], "prep": 2, "df": [2, 10], "men": 2, "newli": 2, "attempt": 2, "duplic": [2, 8], "conjuct": 2, "Will": 2, "conjunct": 2, "rang": [2, 9], "original_valu": 2, "plu": 2, "1870": 2, "expand": 2, "1867": 2, "1868": 2, "1869": 2, "1871": 2, "1872": 2, "1873": 2, "kept": 2, "keep": 2, "appropri": 2, "treat": [2, 9], "import": [2, 7, 10, 12], "dure": [2, 7], "hot": 2, "encod": [2, 3], "vector": [2, 9], "stage": 2, "well": 2, "upper_threshold": 2, "cannot": 2, "robust": 2, "ml": [2, 4, 8, 9], "typic": [2, 7], "leverag": 2, "api": [2, 6, 9], "piplin": 2, "regionf": 2, "sex_regionf_interact": 2, "immyear_caut": [2, 9], "myriad": 2, "explor": [2, 4, 6, 10], "part": [2, 7], "task": [2, 4, 6, 8, 12], "drop_duplicate_a": 2, "out": [2, 7, 12], "best": [2, 7], "smallest": 2, "possibl": 2, "ratio": [2, 8], "beta": [2, 8], "test": [2, 7, 12], "model_explor": [2, 10, 12], "hyper": [2, 6, 12], "paramet": [2, 6, 7, 8, 10, 12], "eval": 2, "skip": [2, 7], "apply_model": 2, "run_all_step": [2, 10, 12], "command": [2, 6, 10, 12], "try": 2, "creation": 2, "iter": 2, "scale": 2, "error": [2, 9], "1900": [2, 12], "about": [2, 10, 12], "1930": [2, 12], "1940": [2, 12], "fail": 2, "were": 2, "sure": [2, 5, 10], "scratch": 2, "although": 2, "know": 2, "haven": 2, "save": [2, 7, 12], "small": 2, "amount": 2, "process": [2, 6, 10], "repeatedli": 2, "help": [2, 7, 10], "neg": [2, 4, 6], "area": 2, "coverag": 2, "increas": [2, 9], "represent": [2, 7], "ensur": 2, "group": [2, 7], "a304bt": 2, "three": [2, 7], "b200": 2, "c201": 2, "d425": 2, "perform": [2, 6, 7, 11], "feature_import": [2, 7, 12], "coeffici": [2, 7], "enabl": [2, 7, 10], "srace": [2, 9, 12], "race_interacted_srac": [2, 9, 12], "hits2": [2, 12], "exact_mult": [2, 12], "ncount": [2, 3, 12], "ncount2": [2, 3, 12], "f_interacted_jw_f": [2, 12], "f_caution": [2, 12], "f_pre": [2, 12], "fbplmatch": [2, 12], "m_interacted_jw_m": [2, 9, 12], "jw_m": [2, 9, 12], "m_pre": [2, 9, 12], "mbplmatch": [2, 12], "sp_interacted_jw_sp": [2, 12], "jw_sp": [2, 12], "sp_pre": [2, 12], "mi": [2, 12], "fsoundex": [2, 12], "lsoundex": [2, 12], "oth": [2, 12], "imm_interacted_immyear_caut": [2, 12], "1900_1910_training_data_20191023": 2, "jw_max_a": 2, "jw_max_b": 2, "f1_match": 2, "f2_match": 2, "byrdifcat": 2, "racematch": 2, "bplmatch": 2, "imm_interacted_bplmatch": 2, "sexmatch": 2, "relatetype_interacted_relatematch": 2, "checkpoint": 3, "no_first_pad": 3, "don": [2, 3], "prepend": 3, "namefrst_unstd_bigram": 3, "namelast_frst_bigram": 3, "namelast_clean_soundex": 3, "input_col": 3, "output_col": 3, "expon": 3, "introduct": 4, "overview": 4, "instal": 4, "pypi": 4, "preprocess": [4, 6, 10, 12], "model": [4, 6, 10], "run": [4, 5, 6, 7, 12], "librari": [4, 6], "mode": [4, 5, 12], "advanc": 4, "workflow": [3, 4], "export": [4, 7, 10], "featur": [4, 6, 7, 8, 10], "reus": 4, "basic": 4, "map": [4, 7, 9], "substitut": [4, 7], "block": [4, 7], "comparison": [4, 7], "pipelin": 4, "ons": 4, "aggreg": 4, "union": 4, "soundex": 4, "power": 4, "regex": 4, "random": [4, 8], "forest": [4, 8], "logist": [4, 8], "regress": [4, 8], "tree": [4, 8], "gradient": [4, 8], "boost": [4, 8], "system": 5, "python": [5, 6, 10], "java": 5, "integr": 5, "apach": 5, "via": [5, 6], "pyspark": [5, 8, 9, 10], "packag": 5, "org": 5, "latest": 5, "pip": 5, "easiest": [5, 10], "through": [5, 7, 9, 10], "instruct": [5, 10], "But": [2, 5], "clone": 5, "github": 5, "repositori": 5, "root": 5, "project": 5, "directori": [5, 10, 12], "develop": [5, 6], "e": 5, "dev": 5, "edit": 5, "made": 5, "built": 5, "tool": [5, 6], "line": [6, 10], "share": 6, "characterist": [6, 7], "correspond": [6, 7], "real": 6, "world": 6, "determinist": [6, 7], "rule": [6, 7], "algorithm": [6, 7], "At": [6, 7], "been": 6, "unit": 6, "census": 6, "hierarch": [6, 10], "structur": 6, "nest": 6, "howev": [3, 6, 12], "tailor": 6, "ignor": 6, "common": [6, 7, 12], "highli": [6, 7], "languag": 6, "further": [6, 12], "broken": 6, "smaller": 6, "sequenc": 6, "linkrun": [6, 10], "prepar": [6, 7, 10], "research": 6, "experi": 6, "understand": 6, "tune": [6, 12], "relationship": 6, "varieti": 7, "normal": 7, "abbrevi": [7, 11], "regist": [7, 10], "datafram": [7, 10, 12], "request": 7, "classif": [7, 8], "metadata": 7, "introspect": 7, "ingest": 7, "inspect": 7, "mani": [7, 10], "aspect": [7, 10], "extens": 7, "longest": 7, "definit": 7, "reduc": [3, 7], "drastic": 7, "improv": 7, "runtim": 7, "separ": 7, "total": 7, "potential_match": [7, 10], "satisfi": 7, "elig": 7, "reshap": 7, "thought": 7, "ahead": 7, "chosen": 7, "experiment": [7, 10], "focus": 7, "demograph": 7, "moment": 7, "veri": [3, 7, 12], "anyon": 7, "percent": 7, "remain": 7, "popul": 7, "pull": 7, "fix": 7, "width": 7, "crosswalk": 7, "construct": 7, "alpha": 8, "hyperparamet": [8, 12], "de": 8, "param": [8, 12], "label": 8, "doc": [8, 9], "commonli": 8, "explan": 8, "randomforestclassifi": 8, "depth": 8, "20": 8, "featuresubsetstrategi": 8, "node": 8, "auto": 8, "onethird": 8, "sqrt": 8, "log2": 8, "15": 8, "generalizedlinearregress": 8, "famili": 8, "binomi": 8, "85": [8, 10], "logisticregress": 8, "decisiontreeclassifi": 8, "mininstancespernod": 8, "caus": 8, "left": 8, "right": [8, 10], "discard": 8, "maxbin": 8, "bin": 8, "discret": 8, "continu": [8, 9, 12], "gbtclassifi": 8, "mother": 9, "point": [9, 12], "x": [9, 10], "y": 9, "hold": 9, "except": 9, "strictli": 9, "inf": 9, "explicitli": 9, "cover": 9, "doubl": 9, "outsid": 9, "job": 10, "high": 10, "class": 10, "handl": 10, "main": 10, "complet": 10, "access": [10, 12], "link_run": 10, "factori": 10, "sparkfactori": 10, "load_config": 10, "load_conf_fil": 10, "sparksess": 10, "now": 10, "let": 10, "load": 10, "our": 10, "my_conf": 10, "lr": 10, "prep_step": 10, "get_step": 10, "enumer": 10, "print": 10, "input_table_nam": 10, "output_table_nam": 10, "run_step": 10, "get_tabl": 10, "matches_df": 10, "hh_model_explor": 10, "method": [10, 12], "interfac": 10, "easili": 10, "conveni": 10, "adjust": 10, "set_loc": 10, "set_num_cor": 10, "set_executor_memori": 10, "5g": 10, "ll": 10, "dictionari": 10, "often": [3, 10], "modul": 10, "pleas": 10, "reproduc": 10, "consol": 10, "cpu": 10, "h": 10, "executor_memori": [10, 12], "execute_task": 10, "execute_command": 10, "conf": [10, 12], "show": 10, "messag": 10, "exit": 10, "memori": 10, "executor": 10, "begin": 10, "execut": 10, "seri": 10, "excute_command": 10, "filepath": 10, "sai": 10, "fullcount_1870_1880": 10, "pattern": 10, "full_count_1870_1880": 10, "prompt": 10, "enter": 10, "text": 10, "unstabl": 10, "topic": 10, "analyz": [10, 12], "set_preexisting_t": 10, "x_persist": 10, "borrow_t": 10, "get_task": 10, "set_print_sql": 10, "x_sql": 10, "x_sqlf": 10, "ipython": 10, "showf": 10, "x_summari": 10, "desc": 10, "x_crosswalk": 10, "x_tab": 10, "q": [10, 12], "x_hh_tfam": 10, "x_tfam": 10, "drop_al": 10, "reload": 10, "x_hh_tfam_2a": 10, "x_tfam_raw": 10, "drop_all_prc": 10, "x_hh_tfam_2b": 10, "x_union": 10, "drop_all_temp": 10, "x_load": 10, "get_set": 10, "set_link_task": 10, "x_parquet_from_csv": 10, "organ": 10, "hierarchi": 10, "five": 10, "hh_match": 10, "someth": 10, "choic": 10, "preexist": 10, "prepped_df_a": 10, "prepped_df_b": 10, "raw_df_b": 10, "raw_df_a": 10, "training_featur": [10, 12], "scored_potential_match": 10, "potential_matches_prep": 10, "exploded_df_b": 10, "exploded_df_a": 10, "predicted_match": 10, "hh_training_featur": [10, 12], "hh_training_data": 10, "hh_predicted_match": 10, "hh_scored_potential_match": 10, "hh_potential_match": 10, "hh_blocked_match": 10, "hh_potential_matchs_prep": 10, "model_eval_training_vector": 10, "model_eval_training_data": 10, "model_eval_repeat_fp": 10, "model_eval_training_featur": 10, "model_eval_training_result": 10, "model_eval_repeat_fn": 10, "hh_model_eval_training_vector": 10, "hh_model_eval_repeat_fp": 10, "hh_model_eval_repeat_fn": 10, "hh_model_eval_training_result": 10, "hh_model_eval_training_featur": 10, "hh_model_eval_training_data": 10, "persist": 10, "hidden": 10, "intermedi": 10, "yet": 10, "databas": 10, "tablenam": 10, "istemporari": 10, "task_nam": 10, "num": 10, "finish": 10, "put": [10, 12], "launch": [10, 12], "my": [10, 12], "subhead": 11, "suppli": 11, "regex_word_replac": 11, "variant": 11, "av": 11, "7th": 11, "swap": 11, "still": 11, "anywher": 11, "proceed": 11, "street_unstd": 11, "dir": 11, "substitutions_street_abbrev": 11, "span": 12, "1920": 12, "deriv": 12, "necessari": [3, 12], "scenario": 12, "copi": [3, 12], "use_potential_matches_featur": 12, "full_count_1900_1910": 12, "50g": 12, "ask": 12, "arg": 12, "partit": 12, "training_data_1900_1910_hlink_featur": 12, "might": 12, "shut": 12, "framework": 12, "etc": 12, "relev": 12, "matrix": 12, "implement": 12, "regular": 12, "training_data_1900_1910": 12, "weren": 12, "ident": 12, "manual": 12, "updat": 12, "isn": 12, "analysi": 12, "training_result": 12, "hh_training_result": 12, "1900_1910_training_result": 12, "repeat_fp": 12, "repeat_fn": 12, "hh_repeat_fp": 12, "hh_repeat_fn": 12, "1900_1910_potential_fp": 12, "1900_1910_potential_fn": 12, "prefer": 12, "ve": 12, "or_group": 2, "belong": 2, "bpl1": 2, "bpl2": 2, "bpl3": 2, "parenthes": 2, "around": 2, "connect": 2, "few": 3, "util": 3, "resourc": 3, "affect": 3}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"column": [0, 2], "map": [0, 2], "basic": [0, 2], "usag": 0, "advanc": [0, 2, 12], "transform": [0, 1, 3, 9], "add_to_a": 0, "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "lowercase_strip": 0, "rationalize_name_word": 0, "remove_qmark_hyphen": 0, "remove_punctu": 0, "replace_apostroph": 0, "remove_alternate_nam": 0, "remove_suffix": 0, "remove_stop_word": 0, "remove_prefix": 0, "condense_strip_whitespac": 0, "remove_one_letter_nam": 0, "split": 0, "array_index": 0, "substr": 0, "divide_by_int": 0, "when_valu": 0, "get_floor": 0, "comparison": [1, 2], "type": [1, 9], "add": 1, "ons": 1, "aggreg": 1, "featur": [1, 2, 3, 9, 12], "household": [1, 2, 7], "maximum_jaro_winkl": 1, "jaro_winkl": 1, "jaro_winkler_street": 1, "max_jaro_winkl": 1, "equal": 1, "f1_match": 1, "f2_match": 1, "not_equ": 1, "equals_as_int": 1, "all_equ": 1, "not_zero_and_not_equ": 1, "time": 1, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "any_equ": 1, "either_are_1": 1, "either_are_0": 1, "second_gen_imm": 1, "rel_jaro_winkl": 1, "extra_children": 1, "jaro_winkler_r": 1, "sum": 1, "length_b": 1, "abs_diff": 1, "b_minus_a": 1, "geo_dist": 1, "fetch_a": 1, "fetch_b": 1, "present_both_year": 1, "neither_are_nul": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_and_equal_categorical_in_univers": 1, "multi_jaro_winkler_search": 1, "sql_condit": [1, 3], "alia": 1, "power": [1, 3], "threshold": 1, "lower_threshold": 1, "upper_threshold": 1, "gt_threshold": 1, "btwn_threshold": 1, "look_at_addl_var": 1, "hit": 1, "hits2": 1, "exact_mult": 1, "jw_max_a": 1, "jw_max_b": 1, "configur": [2, 4, 7], "config": 2, "file": 2, "top": 2, "level": 2, "data": [2, 11, 12], "sourc": [2, 5], "filter": 2, "substitut": [2, 11], "select": [2, 3], "potenti": [2, 12], "match": [2, 7], "univers": 2, "block": 2, "pipelin": [2, 9], "gener": [2, 9, 12], "train": [2, 7, 12], "model": [2, 7, 8, 12], "bigram": 3, "arrai": 3, "union": 3, "soundex": 3, "welcom": 4, "hlink": [4, 10], "": 4, "document": 4, "api": 4, "instal": 5, "requir": 5, "from": 5, "pypi": 5, "introduct": 6, "overview": [6, 7], "link": [7, 10, 12], "task": [7, 10], "preprocess": 7, "step": [7, 10], "relat": 7, "section": 7, "explor": [7, 12], "report": 7, "random_forest": 8, "probit": 8, "logistic_regress": 8, "decision_tre": 8, "gradient_boosted_tre": 8, "interact": [9, 10], "bucket": 9, "run": 10, "us": 10, "librari": 10, "mode": 10, "start": 10, "program": 10, "exampl": [10, 12], "workflow": [10, 12], "1": 11, "tabl": 11, "regex": 11, "word": 11, "replac": 11, "export": 12, "after": 12, "reus": 12, "differ": 12, "year": 12, "ml": 12, "list": 12, "fals": 12, "posit": 12, "neg": 12, "fp": 12, "fn": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 60}, "alltitles": {"Column Mappings": [[0, "column-mappings"], [2, "column-mappings"]], "Basic Usage": [[0, "basic-usage"]], "Advanced Usage": [[0, "advanced-usage"]], "Transforms": [[0, "transforms"]], "add_to_a": [[0, "add-to-a"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "lowercase_strip": [[0, "lowercase-strip"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_punctuation": [[0, "remove-punctuation"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_suffixes": [[0, "remove-suffixes"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_prefixes": [[0, "remove-prefixes"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "split": [[0, "split"]], "array_index": [[0, "array-index"]], "mapping": [[0, "mapping"]], "substring": [[0, "substring"]], "divide_by_int": [[0, "divide-by-int"]], "when_value": [[0, "when-value"]], "get_floor": [[0, "get-floor"]], "Comparison types, transform add-ons, aggregate features, and household aggregate features": [[1, "comparison-types-transform-add-ons-aggregate-features-and-household-aggregate-features"]], "Comparison types": [[1, "comparison-types"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "equals": [[1, "equals"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "not_equals": [[1, "not-equals"]], "equals_as_int": [[1, "equals-as-int"]], "all_equals": [[1, "all-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "and": [[1, "and"]], "times": [[1, "times"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "any_equals": [[1, "any-equals"]], "either_are_1": [[1, "either-are-1"]], "either_are_0": [[1, "either-are-0"]], "second_gen_imm": [[1, "second-gen-imm"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "extra_children": [[1, "extra-children"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "sum": [[1, "sum"]], "length_b": [[1, "length-b"]], "abs_diff": [[1, "abs-diff"]], "b_minus_a": [[1, "b-minus-a"]], "geo_distance": [[1, "geo-distance"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "present_both_years": [[1, "present-both-years"]], "neither_are_null": [[1, "neither-are-null"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "sql_condition": [[1, "sql-condition"], [3, "sql-condition"]], "Feature add-ons": [[1, "feature-add-ons"]], "alias": [[1, "alias"]], "power": [[1, "power"], [3, "power"]], "threshold": [[1, "threshold"]], "lower_threshold": [[1, "lower-threshold"]], "upper_threshold": [[1, "upper-threshold"]], "gt_threshold": [[1, "gt-threshold"]], "btwn_threshold": [[1, "btwn-threshold"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "Aggregate Features": [[1, "aggregate-features"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "exact_mult": [[1, "exact-mult"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "Configuration": [[2, "configuration"]], "Basic Config File": [[2, "basic-config-file"]], "Advanced Config File": [[2, "advanced-config-file"]], "Top level configs": [[2, "top-level-configs"]], "Data sources": [[2, "data-sources"]], "Filter": [[2, "filter"]], "Substitution Columns": [[2, "substitution-columns"]], "Feature Selections": [[2, "feature-selections"]], "Potential Matches Universe": [[2, "potential-matches-universe"]], "Blocking": [[2, "blocking"]], "Comparisons": [[2, "comparisons"]], "Household Comparisons": [[2, "household-comparisons"]], "Comparison Features": [[2, "comparison-features"]], "Pipeline-generated Features": [[2, "pipeline-generated-features"]], "Training and models": [[2, "training-and-models"]], "Household training and models": [[2, "household-training-and-models"]], "Welcome to hlink\u2019s documentation!": [[4, "welcome-to-hlink-s-documentation"]], "Configuration API": [[4, "configuration-api"], [4, null]], "Installation": [[5, "installation"]], "Requirements": [[5, "requirements"]], "Installing from PyPI": [[5, "installing-from-pypi"]], "Installing from source": [[5, "installing-from-source"]], "Introduction": [[6, "introduction"]], "Overview": [[6, "overview"], [7, "overview"], [7, "id1"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"]], "Link Tasks": [[7, "link-tasks"]], "Preprocessing": [[7, "preprocessing"]], "Task steps": [[7, "task-steps"], [7, "id2"], [7, "id5"], [7, "id8"], [7, "id11"], [7, "id14"]], "Related Configuration Sections": [[7, "related-configuration-sections"], [7, "id3"], [7, "id6"], [7, "id9"], [7, "id12"], [7, "id15"]], "Training and Household Training": [[7, "training-and-household-training"]], "Matching": [[7, "matching"]], "Household Matching": [[7, "household-matching"]], "Model Exploration and Household Model Exploration": [[7, "model-exploration-and-household-model-exploration"]], "Reporting": [[7, "reporting"]], "Models": [[8, "models"]], "random_forest": [[8, "random-forest"]], "probit": [[8, "probit"]], "logistic_regression": [[8, "logistic-regression"]], "decision_tree": [[8, "decision-tree"]], "gradient_boosted_trees": [[8, "gradient-boosted-trees"]], "Pipeline generated features": [[9, "pipeline-generated-features"]], "Transformer types": [[9, "transformer-types"]], "interaction": [[9, "interaction"]], "bucketizer": [[9, "bucketizer"]], "Running hlink": [[10, "running-hlink"]], "Using hlink as a Library": [[10, "using-hlink-as-a-library"]], "Interactive Mode": [[10, "interactive-mode"]], "Starting the program": [[10, "starting-the-program"]], "Running Linking Tasks and Steps": [[10, "running-linking-tasks-and-steps"]], "Example interactive mode workflow": [[10, "example-interactive-mode-workflow"]], "Substitutions": [[11, "substitutions"]], "1:1 substitution by data table": [[11, "substitution-by-data-table"]], "Substitution by regex word replace": [[11, "substitution-by-regex-word-replace"]], "Advanced Workflow Examples": [[12, "advanced-workflow-examples"]], "Export training data after generating features to reuse in different linking years": [[12, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Example training data export with generated ML features": [[12, "example-training-data-export-with-generated-ml-features"]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[12, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Example model exploration and FP/FN export workflow": [[12, "example-model-exploration-and-fp-fn-export-workflow"]], "Feature Selection Transforms": [[3, "feature-selection-transforms"]], "bigrams": [[3, "bigrams"]], "array": [[3, "array"]], "union": [[3, "union"]], "soundex": [[3, "soundex"]]}, "indexentries": {}}) \ No newline at end of file +Search.setIndex({"docnames": ["column_mappings", "comparison_types", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "filenames": ["column_mappings.md", "comparison_types.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "titles": ["Column Mappings", "Comparison types, transform add-ons, aggregate features, and household aggregate features", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "terms": {"each": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], "read": [0, 1, 2, 7, 10], "from": [0, 1, 2, 4, 6, 7, 8, 9, 10, 12], "input": [0, 1, 2, 3, 6, 7, 10, 11], "dataset": [0, 1, 2, 3, 6, 7, 10, 12], "hlink": [0, 1, 2, 3, 5, 6, 7, 12], "It": [0, 1, 2, 6, 10, 12], "ha": [0, 1, 2, 3, 6, 10, 12], "column_nam": [0, 1, 2, 11], "attribut": [0, 1, 2, 3, 7, 8, 9, 10, 11], "which": [0, 1, 2, 3, 6, 7, 9, 10, 12], "specifi": [0, 1, 2, 6, 7, 9, 10, 11], "name": [0, 1, 2, 3, 10, 11], "both": [0, 1, 2, 7, 12], "option": [0, 1, 2, 3, 6, 7, 8, 10, 12], "mai": [0, 2, 6, 7, 10], "have": [0, 1, 2, 3, 5, 6, 7, 8, 10, 12], "an": [0, 1, 2, 6, 8, 10], "alia": [0, 2, 7], "give": [0, 2], "new": [0, 2, 12], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "support": [0, 7, 8], "some": [0, 1, 2, 3, 6, 7, 10], "make": [0, 1, 2, 5, 12], "chang": [0, 1, 2, 5, 10, 12], "data": [0, 1, 4, 6, 7, 10], "thei": [0, 1, 2, 7, 10], "ar": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12], "These": [0, 1, 2, 6, 7, 8, 9], "clean": [0, 6], "harmon": 0, "The": [0, 1, 2, 3, 5, 7, 8, 9, 10, 12], "avail": [0, 1, 2, 3, 5, 7, 8, 9, 12], "list": [0, 1, 2, 3, 4, 8, 10, 11], "below": [0, 1, 2, 3, 8, 9, 10], "section": [0, 1, 2, 12], "By": [0, 2, 10], "default": [0, 1, 2, 7, 8, 10], "must": [0, 1, 2, 3, 8, 9, 11], "same": [0, 1, 2, 3, 6, 7, 10], "With": [0, 9], "override_column_a": [0, 2, 3], "override_column_b": [0, 2, 3], "you": [0, 1, 2, 5, 10, 11, 12], "can": [0, 1, 2, 3, 5, 6, 7, 8, 10, 12], "differ": [0, 1, 2, 4, 6, 7], "either": [0, 1, 2, 6, 11], "A": [0, 1, 2, 3, 9, 10], "b": [0, 1, 2, 3, 10], "when": [0, 1, 2, 3, 7, 12], "do": [0, 1, 3, 10, 12], "thi": [0, 1, 2, 3, 5, 6, 7, 9, 10, 12], "appli": [0, 2, 3, 7, 12], "onli": [0, 1, 2, 7, 12], "non": 0, "overrid": [0, 2], "also": [0, 1, 2, 5, 6, 7, 9, 10, 12], "provid": [0, 2, 3, 6, 7, 9, 10], "override_transform": [0, 2], "describ": [0, 2, 10], "type": [0, 2, 3, 4, 7, 8, 10, 11, 12], "oper": [0, 2], "singl": [0, 2, 3, 10, 12], "output": [0, 1, 2, 3, 6, 7, 10, 12], "more": [0, 1, 2, 9, 10, 12], "than": [0, 1, 2, 8], "one": [0, 1, 2, 7], "order": [0, 2, 7], "so": [0, 1, 2, 5, 12], "anoth": [0, 1, 3, 7], "format": 0, "letter": 0, "t": [0, 1, 2, 3, 12], "u": 0, "repres": [0, 1, 2, 9, 10], "arbitrari": 0, "requir": [0, 1, 2, 3, 4, 7, 9, 10, 11], "addit": [0, 1, 2, 5, 6, 10], "vari": [0, 2, 3], "inform": [0, 1, 2, 10], "appear": [0, 1], "its": [0, 1, 6, 10], "suffix": 0, "mean": [0, 2], "two": [0, 1, 2, 3, 6, 7, 9, 10, 12], "link": [0, 1, 2, 4, 6, 8], "most": [0, 1, 7, 10], "independ": [0, 2], "For": [0, 1, 2, 7, 10, 12], "exampl": [0, 1, 2], "taken": [0, 1], "10": [0, 2, 5, 12], "year": [0, 1, 2, 3, 4], "apart": 0, "want": [0, 1, 2, 10, 12], "standard": [0, 1, 11], "ag": [0, 1, 2, 3], "variabl": [0, 1, 2, 12], "i": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12], "compar": [0, 1, 2, 6, 7], "between": [0, 1, 2, 6, 7, 10, 12], "To": [0, 1, 5, 7, 10], "could": [0, 2], "creat": [0, 2, 6, 7, 9, 10, 11, 12], "age_at_dataset_b": 0, "ad": [0, 1, 2], "column_map": [0, 2, 7], "valu": [0, 1, 2, 3, 8, 9, 10, 11], "As": 0, "suppos": [0, 2], "record": [0, 1, 2, 6, 7], "person": [0, 1, 6], "": [0, 1, 2, 6, 7, 10, 11], "first": [0, 1, 2, 5, 7, 10, 11], "string": [0, 1, 2, 3, 7, 8, 10, 11], "In": [0, 1, 6, 10, 12], "call": 0, "namefrst": [0, 1, 2], "entir": [0, 2], "lowercas": 0, "first_nam": 0, "uppercas": 0, "follow": [0, 1, 6, 10, 11, 12], "configur": [0, 1, 6, 10, 12], "add": [0, 4], "_": [0, 1, 2, 3, 4, 8, 9, 10], "given": [0, 1, 2, 3, 8, 12], "numer": [0, 1], "11": [0, 2, 5, 9], "concat": 0, "concaten": [0, 1], "end": [0, 1, 2, 3, 11], "col": [0, 1], "togeth": [0, 1, 2], "take": [0, 1, 2, 3, 7, 10], "column_to_append": 0, "multipl": [0, 1, 2, 10], "time": [0, 2, 7, 10], "row": [0, 2, 3], "If": [0, 1, 2, 3, 7, 8, 10, 11], "automat": [0, 2, 5, 7], "convert": [0, 1, 2], "befor": [0, 1, 2, 3, 5, 7], "statefip": [0, 1, 2], "counti": [0, 1], "strip": [0, 7], "alphabet": 0, "charact": 0, "lower": [0, 1], "case": [0, 1, 2, 3, 6], "white": 0, "space": [0, 2, 3, 11], "start": [0, 11], "ration": 0, "word": [0, 4], "replac": [0, 1, 4], "sinc": [0, 2], "peopl": [0, 1, 6, 10], "raw": [0, 2, 7, 10], "censu": [0, 7, 12], "contain": [0, 1, 11], "lead": 0, "better": [0, 6], "match": [0, 1, 4, 6, 10, 11, 12], "remov": 0, "qmark": 0, "hyphen": 0, "punctuat": 0, "apostroph": 0, "altern": [0, 2], "surround": 0, "all": [0, 1, 2, 3, 7, 8, 9, 10], "them": [0, 1, 2, 7], "jr": [0, 2], "sr": [0, 2], "ii": [0, 2], "iii": [0, 2], "stop": 0, "last": [0, 1, 7, 9], "street": [0, 1], "avenu": [0, 11], "blvd": 0, "circl": 0, "court": 0, "road": 0, "prefix": 0, "like": [0, 2, 7, 10], "m": [0, 1], "mr": 0, "ah": 0, "chines": 0, "condens": 0, "whitespac": [0, 7], "leav": 0, "behind": 0, "arrai": [0, 2, 4, 8, 9], "namefrst_split": [0, 2], "namefrst_clean": [0, 2], "index": [0, 5], "select": [0, 1, 4, 6, 10, 12], "element": 0, "posit": [0, 1, 2, 4, 6], "second": [0, 1, 2, 11], "1": [0, 1, 2, 4, 7, 8, 9, 10, 12], "item": 0, "set": [0, 1, 2, 3, 6, 7, 10, 12], "Then": [0, 5], "0": [0, 1, 2, 7, 8, 9, 10, 12], "initi": [0, 1, 10], "probabl": [0, 2, 8], "middl": [0, 1], "namefrst_mid_init": [0, 1], "otherwis": [0, 1, 9, 12], "known": 0, "recod": 0, "birthyr": [0, 2], "clean_birthyr": [0, 2, 3], "9999": [0, 2, 9], "1999": [0, 2], "9998": 0, "divid": 0, "int": [0, 1, 2, 3, 8], "integ": [0, 1, 2, 9], "result": [0, 1, 6, 9, 10, 12], "instanc": [0, 8], "birthplac": [0, 2], "detail": [0, 2, 10], "version": [0, 5, 12], "gener": [0, 1, 4, 6, 7, 10], "least": [0, 1], "signific": 0, "digit": 0, "we": [0, 1, 10, 12], "simpli": [0, 2], "drop": [0, 2, 10], "100": [0, 2, 12], "round": [0, 2], "lowest": 0, "whole": [0, 6], "number": [0, 1, 2, 3, 7, 8, 10], "floor": 0, "function": [0, 1, 2, 6, 10], "bpl": [0, 1, 2], "bpl_root": 0, "condit": [0, 1, 2, 3, 4, 7], "logic": 0, "work": [0, 1, 2, 5, 7, 10, 12], "sql": [0, 1, 2, 3, 4, 7, 10], "express": [0, 1, 2], "claus": [0, 1], "if_valu": 0, "else_valu": 0, "race": [0, 1, 2, 9, 12], "ipum": [0, 6], "code": [0, 1, 2, 5], "categori": [0, 8], "get": [0, 1, 2, 10], "down": [0, 6, 12], "nearest": 0, "produc": [0, 10], "relat": [0, 1, 2], "hundr": 0, "300": 0, "child": [0, 8], "household": [0, 4, 6, 8, 10, 12], "head": 0, "301": 0, "302": 0, "adopt": 0, "303": 0, "step": [0, 1, 2, 6], "usual": [0, 7, 12], "need": [0, 1, 2, 7, 10, 12], "2": [0, 1, 2, 3, 7, 8, 11, 12], "spous": 0, "3": [0, 1, 2, 5, 7, 8, 9, 12], "4": [0, 1, 8], "law": 0, "5": [0, 1, 2, 8, 9, 10, 12], "parent": [0, 1, 11], "6": [0, 2, 8, 9, 12], "7": [0, 1, 2, 8, 12], "sibl": 0, "12": [0, 5], "relate_div_100": [0, 1, 2], "page": [1, 2, 10], "comparison_featur": [1, 2, 7], "along": 1, "header": [1, 2, 9, 11], "context": [1, 9], "relatematch": [1, 2], "comparison_typ": [1, 2], "categor": [1, 2, 8, 9], "true": [1, 2, 3, 7, 9, 11, 12], "maximum": [1, 8], "jaro": [1, 9], "winkler": [1, 9], "find": [1, 7, 12], "greatest": 1, "among": 1, "cartesian": 1, "product": [1, 6, 12], "column": [1, 3, 4, 7, 9, 10, 11, 12], "namelast": [1, 2], "would": [1, 2, 12], "return": [1, 3, 8, 10], "four": 1, "namefrst_a": 1, "namefrst_b": 1, "namelast_b": 1, "namelast_a": 1, "maximum_jw": 1, "score": [1, 2, 7, 9], "namefrst_jw": [1, 2, 12], "geograph": 1, "filter": [1, 4, 7, 11], "major": [1, 10], "locat": [1, 2, 10], "boundari": 1, "zero": 1, "jw_street": 1, "enum_dist": 1, "max": [1, 8, 10], "member": [1, 7], "neighborhood": 1, "surnam": 1, "related_individual_max_jw": 1, "namefrst_rel": 1, "assert": [1, 10], "NOT": 1, "distinct": 1, "f1": 1, "evalu": [1, 2, 6, 7, 8], "ani": [1, 2, 3, 5, 8], "potenti": [1, 4, 7], "mismatch": 1, "queri": [1, 2], "fi": 1, "OR": [1, 2], "mi0": 1, "mi1": 1, "THEN": 1, "els": [1, 2, 3], "first_init_col": 1, "namefrst_init": 1, "mid_init_col": 1, "namefrst_mid_init_2": 1, "f2": 1, "empti": 1, "null": [1, 2, 3], "AND": [1, 2], "individu": [1, 2, 7, 12], "mainli": 1, "caution": [1, 9], "flag": [1, 9, 10, 12], "f": [1, 10], "sp": 1, "m_caution": [1, 2, 9, 12], "mbpl": 1, "mother_birthyr": 1, "stepmom": 1, "momloc": 1, "comp_a": [1, 2], "comp_b": [1, 2], "comp_c": 1, "parent_step_chang": 1, "comp_d": 1, "check": [1, 10], "sign": 1, "boolean": [1, 2, 3, 11, 12], "form": [1, 7, 11], "cast": 1, "namelast_equal_as_int": 1, "namelast_clean": [1, 2, 3], "whether": [1, 2, 11], "join": [1, 2, 11], "across": 1, "being": [1, 7], "exact": [1, 2], "namefrst_unstd": [1, 2], "present": [1, 2, 9], "nonzero": 1, "primarili": [1, 7], "indic": [1, 12], "kind": 1, "incompar": 1, "akin": 1, "miss": [1, 10], "see": [1, 2, 5, 10, 12], "univers": [1, 4, 7], "similar": 1, "fbpl_nomatch": 1, "fbpl": 1, "allow": [1, 2, 7, 12], "up": [1, 2, 10, 11], "sub": 1, "object": [1, 2, 6, 10], "document": [1, 8, 10, 12], "sp_caution": [1, 2, 12], "spouse_bpl": 1, "spouse_birthyr": 1, "durmarr": [1, 2], "new_marr": [1, 2], "street_jw": [1, 2, 12], "9": 1, "multipli": 1, "after": [1, 2, 4, 8, 10], "float": [1, 2, 8], "comp": 1, "c": 1, "sploc": 1, "012": 1, "fals": [1, 2, 3, 4, 6, 10], "d": 1, "under": [1, 2], "specif": [1, 2, 10], "circumst": 1, "should": [1, 2, 3, 8, 9, 10], "mid_init_match": 1, "either_1": 1, "nativ": 1, "either_0": 1, "gen": 1, "imm": [1, 2, 12], "immigr": 1, "look": [1, 10, 11], "foreign": 1, "born": 1, "sgen": [1, 2, 12], "rel": [1, 2, 12], "scala": 1, "determin": [1, 7], "greater": [1, 5], "jw_threshold": 1, "less": [1, 2], "age_threshold": 1, "sex": [1, 2, 11], "sampl": 1, "related_individual_row": 1, "unrel": 1, "depend": [1, 2, 5, 12], "name_col": 1, "birthyr_col": 1, "namefrst_related_row": 1, "replaced_birthyr": [1, 2, 3], "extra": 1, "children": 1, "who": 1, "base": [1, 2, 7], "expect": 1, "count": [1, 10, 12], "suspect": [1, 6], "relate_col": 1, "histid_col": 1, "id": [1, 2], "birth": 1, "year_b": 1, "wa": [1, 12], "minimum": [1, 8], "accept": [1, 2, 12], "consid": [1, 8], "histid": [1, 2, 12], "1910": [1, 2, 12], "8": [1, 2, 5, 10], "rate": 1, "calcul": [1, 12], "percentag": 1, "seen": 1, "neighbor": 1, "meet": 1, "95": 1, "nbor": [1, 2, 12], "namelast_neighbor": 1, "05": [1, 2], "namelast_popularity_sum": 1, "namelast_popular": 1, "length": [1, 2, 9], "size": 1, "ab": 1, "diff": 1, "absolut": 1, "invalid": [1, 8], "instead": [1, 2, 3, 5, 7], "marriag": 1, "durat": 1, "99": [1, 2], "placehold": 1, "unknown": 1, "exclud": 1, "those": [1, 2], "consider": 1, "byrdiff": [1, 2, 12], "mardurmatch": [1, 2], "14": 1, "minu": [1, 2], "subtract": 1, "geo": 1, "distanc": [1, 8], "lookup": 1, "tabl": [1, 2, 4, 7, 10, 12], "core": [1, 7, 10, 12], "dist_tabl": 1, "py": [1, 2], "There": [1, 2, 7], "sever": [1, 6], "wai": [1, 5, 10], "file": [1, 3, 4, 6, 7, 10, 11, 12], "kei": [1, 7, 10], "key_count": 1, "secondari": 1, "serv": 1, "back": 1, "primari": [1, 6], "doe": [1, 3, 7, 12], "particularli": 1, "state": [1, 6], "much": [1, 7], "fewer": [1, 8], "combin": [1, 2, 3, 7], "thu": 1, "risk": 1, "fill": 1, "aren": 1, "ex": 1, "just": [1, 2, 10, 12], "even": 1, "though": 1, "distances_fil": 1, "path": [1, 2, 10, 11, 12], "table_nam": 1, "what": [1, 2, 10, 12], "onc": [1, 10], "loc_a": 1, "where": [1, 7, 10, 12], "come": 1, "loc_b": 1, "distance_col": 1, "source_column_a": 1, "sourc": [1, 4, 7, 10, 12], "source_column_b": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b_0": 1, "loc_b_1": 1, "secondary_key_count": 1, "backup": 1, "secondary_table_nam": 1, "secondary_distances_fil": 1, "secondary_source_column": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_distance_col": 1, "state_dist": 1, "state_distance_lookup": 1, "county_state_dist": 1, "csv": [1, 2, 7, 10, 11, 12], "statecode1": 1, "statecode2": 1, "dist": 1, "county_dist": [1, 2, 12], "county_distance_lookup": 1, "county_1900_1910_distances_km": 1, "from_icpsrctyi": 1, "to_icpsrctyi": 1, "from_statefip": 1, "to_statefip": 1, "distance_km": 1, "state_1900_1910_distances_km": 1, "fetch": 1, "neither": 1, "nor": 1, "mpre": 1, "m_namefrst": 1, "accord": 1, "niu": 1, "other": [1, 2, 12], "mfbplmatch": 1, "multi": 1, "search": 1, "special": 1, "simplifi": 1, "particular": [1, 2], "constraint": 1, "num_col": 1, "whose": 1, "templat": 1, "n": [1, 8, 9], "per": [1, 2, 8, 9, 10], "current": [1, 2, 10], "respect": [1, 7], "jw_col_templ": 1, "jw": 1, "pair": [1, 12], "equal_and_not_null_templ": 1, "final": [1, 2, 12], "comput": [1, 3, 7], "_namefrst": 1, "_bpl": 1, "_sex": 1, "25": 1, "nvl": 1, "sm_namefrst": 1, "sn_namefrst": 1, "sm_bpl": 1, "sn_bpl": 1, "sm_sex": 1, "sn_sex": 1, "pass": [1, 2, 7, 8], "flexibl": 1, "user": [1, 10], "write": [1, 10, 12], "own": [1, 2], "favor": 1, "reason": 1, "good": 1, "fallback": 1, "defin": [1, 7, 8, 9, 10], "spark": [1, 2, 3, 5, 8, 9, 10, 12], "builtin": 1, "argument": [1, 10, 12], "namelast_jw_max": 1, "namelast1": 1, "namelast2": 1, "namelast3": 1, "abov": [1, 5], "extend": 1, "beyond": 1, "top": [1, 4], "level": [1, 4, 10], "everi": [1, 3], "jw_f": [1, 2, 12], "father_namefrst": 1, "rais": [1, 3], "exponenti": 1, "squar": 1, "county_distance_squar": [1, 2, 12], "county_a": 1, "county_b": 1, "upper": 1, "gt": 1, "btwn": 1, "addl": 1, "var": [1, 2], "program": [1, 2, 7, 12], "report": [1, 4, 6, 10], "addl_var": 1, "check_val_expr": 1, "else_v": 1, "volumn": 1, "datasourc": [1, 2, 10], "yrimmig": 1, "immyear_diff": [1, 2, 9, 12], "includ": [1, 2, 7, 9, 10], "train": [1, 4, 6, 8, 10], "independent_var": [1, 2, 12], "config": [1, 3, 4, 7, 10, 12], "id_column": [1, 2], "_a": 1, "mult": 1, "exist": [1, 2, 10], "within": [1, 2, 6, 10, 11], "hh_train": [1, 2, 7, 10, 12], "hh": 1, "highest": [1, 2], "against": [1, 11], "ten": [1, 2], "tell": [2, 3], "how": [2, 7], "descript": [2, 8, 10], "refer": 2, "here": [2, 7, 10, 12], "tutori": [2, 10], "script": [2, 6, 10], "discuss": 2, "readm": 2, "note": 2, "written": [2, 6], "toml": [2, 6, 10], "abl": 2, "json": [2, 10], "datasource_a": [2, 7], "datasource_b": [2, 7], "transform": [2, 4, 6, 7], "lowercase_strip": 2, "add_to_a": 2, "age_2": 2, "derived_from": 2, "expand_length": 2, "explod": [2, 7], "jaro_winkl": 2, "namelast_jw": [2, 12], "threshold": [2, 8, 12], "feature_nam": 2, "79": 2, "84": 2, "complex": [2, 3], "machin": [2, 6, 7, 10, 12], "learn": [2, 6, 7, 10, 12], "probabilist": [2, 6], "drop_data_from_scored_match": 2, "us1900": 2, "us1900m_usa": 2, "p": 2, "parquet": [2, 7], "us1910": 2, "us1910m_usa": 2, "training_data_subset": 2, "serialp": 2, "rationalize_name_word": 2, "remove_qmark_hyphen": 2, "replace_apostroph": 2, "remove_suffix": 2, "remove_alternate_nam": 2, "condense_strip_whitespac": 2, "split": [2, 3, 7, 8, 9, 12], "namefrst_std": [2, 11], "array_index": 2, "bpl_orig": 2, "divide_by_int": 2, "get_floor": 2, "statefip_h": 2, "output_typ": 2, "substitution_column": [2, 7, 11], "join_column": [2, 11], "join_valu": [2, 11], "substitution_fil": [2, 11], "name_std": [2, 11], "male": [2, 11], "femal": [2, 11], "feature_select": [2, 3, 7], "input_column": [2, 3, 9], "output_column": [2, 3, 9], "sql_condit": 2, "namelast_bigram": 2, "bigram": [2, 4], "bpl_clean": 2, "bpl_str": 2, "washington": 2, "bpl2_str": 2, "53": 2, "region": [2, 12], "attach_vari": 2, "region_dict": 2, "col_to_join_on": 2, "col_to_add": 2, "null_fil": 2, "col_typ": 2, "potential_matches_univers": [2, 7], "birthyr_3": 2, "namefrst_std_jw": [2, 12], "75": [2, 8, 12], "comparis": 2, "post": [2, 7], "hh_comparison": [2, 7], "threshold_expr": 2, "fetch_a": 2, "sex_equ": 2, "equal": [2, 11], "relate_a": [2, 9], "pipeline_featur": [2, 7, 9], "sex_region_interact": 2, "transformer_typ": [2, 9], "interact": [2, 4, 7, 12], "relatetyp": [2, 9], "bucket": [2, 7], "hit": [2, 10, 12], "scale_data": [2, 12], "training_data": [2, 10], "dependent_var": [2, 12], "score_with_model": [2, 12], "use_training_data_featur": [2, 7, 12], "split_by_id_a": [2, 12], "decis": [2, 4, 8, 12], "drop_duplicate_with_threshold_ratio": [2, 12], "n_training_iter": [2, 7, 12], "output_suspicious_td": [2, 12], "param_grid": [2, 12], "model_paramet": [2, 7, 8, 12], "random_forest": [2, 12], "maxdepth": [2, 8, 12], "numtre": [2, 8, 12], "005": 2, "threshold_ratio": [2, 8, 12], "logistic_regress": [2, 12], "50": [2, 12], "65": 2, "80": 2, "chosen_model": [2, 8, 12], "prediction_col": 2, "predict": [2, 12], "hh_col": 2, "hh_training_data_1900_1910": 2, "probit": [2, 4], "go": [2, 10], "your": [2, 3, 5, 7, 10, 12], "uniqu": 2, "identifi": [2, 6, 12], "full": [2, 7, 12], "short": 2, "alphanumer": 2, "convert_ints_to_long": 2, "long": [2, 11], "especi": 2, "assum": 2, "schema": 2, "sometim": 2, "term": 2, "bigint": 2, "thing": 2, "my_fil": 2, "subset": [2, 11], "limit": 2, "extract": 2, "modifi": 2, "meant": 2, "usag": [2, 3, 4, 10], "set_value_column_a": [2, 3], "liter": 2, "set_value_column_b": [2, 3], "iv": 2, "v": 2, "vi": 2, "vii": 2, "viii": 2, "namelast_clean_bigram": [2, 3], "fed": [2, 7], "prep": 2, "df": [2, 10], "men": 2, "newli": 2, "attempt": 2, "duplic": [2, 8], "conjuct": 2, "Will": 2, "conjunct": 2, "rang": [2, 9], "original_valu": 2, "plu": 2, "1870": 2, "expand": 2, "1867": 2, "1868": 2, "1869": 2, "1871": 2, "1872": 2, "1873": 2, "kept": 2, "keep": 2, "appropri": 2, "treat": [2, 9], "import": [2, 7, 10, 12], "dure": [2, 7], "hot": 2, "encod": [2, 3], "vector": [2, 9], "stage": 2, "well": 2, "upper_threshold": 2, "cannot": 2, "robust": 2, "ml": [2, 4, 8, 9], "typic": [2, 7], "leverag": 2, "api": [2, 6, 9], "piplin": 2, "regionf": 2, "sex_regionf_interact": 2, "immyear_caut": [2, 9], "myriad": 2, "explor": [2, 4, 6, 10], "part": [2, 7], "task": [2, 4, 6, 8, 12], "drop_duplicate_a": 2, "out": [2, 3, 7, 12], "best": [2, 7], "smallest": 2, "possibl": 2, "ratio": [2, 8], "beta": [2, 8], "test": [2, 7, 12], "model_explor": [2, 10, 12], "hyper": [2, 6, 12], "paramet": [2, 6, 7, 8, 10, 12], "eval": 2, "skip": [2, 3, 7], "apply_model": 2, "run_all_step": [2, 10, 12], "command": [2, 6, 10, 12], "try": 2, "creation": 2, "iter": 2, "scale": 2, "error": [2, 9], "1900": [2, 12], "about": [2, 10, 12], "1930": [2, 12], "1940": [2, 12], "fail": 2, "were": 2, "sure": [2, 5, 10], "scratch": 2, "although": 2, "know": 2, "haven": 2, "save": [2, 7, 12], "small": 2, "amount": 2, "process": [2, 6, 10], "repeatedli": 2, "help": [2, 7, 10], "neg": [2, 4, 6], "area": 2, "coverag": 2, "increas": [2, 9], "represent": [2, 7], "ensur": 2, "group": [2, 7], "a304bt": 2, "three": [2, 7], "b200": 2, "c201": 2, "d425": 2, "perform": [2, 6, 7, 11], "feature_import": [2, 7, 12], "coeffici": [2, 7], "enabl": [2, 7, 10], "srace": [2, 9, 12], "race_interacted_srac": [2, 9, 12], "hits2": [2, 12], "exact_mult": [2, 12], "ncount": [2, 3, 12], "ncount2": [2, 3, 12], "f_interacted_jw_f": [2, 12], "f_caution": [2, 12], "f_pre": [2, 12], "fbplmatch": [2, 12], "m_interacted_jw_m": [2, 9, 12], "jw_m": [2, 9, 12], "m_pre": [2, 9, 12], "mbplmatch": [2, 12], "sp_interacted_jw_sp": [2, 12], "jw_sp": [2, 12], "sp_pre": [2, 12], "mi": [2, 12], "fsoundex": [2, 12], "lsoundex": [2, 12], "oth": [2, 12], "imm_interacted_immyear_caut": [2, 12], "1900_1910_training_data_20191023": 2, "jw_max_a": 2, "jw_max_b": 2, "f1_match": 2, "f2_match": 2, "byrdifcat": 2, "racematch": 2, "bplmatch": 2, "imm_interacted_bplmatch": 2, "sexmatch": 2, "relatetype_interacted_relatematch": 2, "checkpoint": 3, "no_first_pad": 3, "don": [2, 3], "prepend": 3, "namefrst_unstd_bigram": 3, "namelast_frst_bigram": 3, "namelast_clean_soundex": 3, "input_col": 3, "output_col": 3, "expon": 3, "introduct": 4, "overview": 4, "instal": 4, "pypi": 4, "preprocess": [4, 6, 10, 12], "model": [4, 6, 10], "run": [4, 5, 6, 7, 12], "librari": [4, 6], "mode": [4, 5, 12], "advanc": 4, "workflow": [3, 4], "export": [4, 7, 10], "featur": [4, 6, 7, 8, 10], "reus": 4, "basic": 4, "map": [4, 7, 9], "substitut": [4, 7], "block": [4, 7], "comparison": [4, 7], "pipelin": 4, "ons": 4, "aggreg": 4, "union": 4, "soundex": 4, "power": 4, "regex": 4, "random": [4, 8], "forest": [4, 8], "logist": [4, 8], "regress": [4, 8], "tree": [4, 8], "gradient": [4, 8], "boost": [4, 8], "system": 5, "python": [5, 6, 10], "java": 5, "integr": 5, "apach": 5, "via": [5, 6], "pyspark": [5, 8, 9, 10], "packag": 5, "org": 5, "latest": 5, "pip": 5, "easiest": [5, 10], "through": [5, 7, 9, 10], "instruct": [5, 10], "But": [2, 5], "clone": 5, "github": 5, "repositori": 5, "root": 5, "project": 5, "directori": [5, 10, 12], "develop": [5, 6], "e": 5, "dev": 5, "edit": 5, "made": 5, "built": 5, "tool": [5, 6], "line": [6, 10], "share": 6, "characterist": [6, 7], "correspond": [6, 7], "real": 6, "world": 6, "determinist": [6, 7], "rule": [6, 7], "algorithm": [6, 7], "At": [6, 7], "been": 6, "unit": 6, "census": 6, "hierarch": [6, 10], "structur": 6, "nest": 6, "howev": [3, 6, 12], "tailor": 6, "ignor": 6, "common": [6, 7, 12], "highli": [6, 7], "languag": 6, "further": [6, 12], "broken": 6, "smaller": 6, "sequenc": 6, "linkrun": [6, 10], "prepar": [6, 7, 10], "research": 6, "experi": 6, "understand": 6, "tune": [6, 12], "relationship": 6, "varieti": 7, "normal": 7, "abbrevi": [7, 11], "regist": [7, 10], "datafram": [7, 10, 12], "request": 7, "classif": [7, 8], "metadata": 7, "introspect": 7, "ingest": 7, "inspect": 7, "mani": [7, 10], "aspect": [7, 10], "extens": 7, "longest": 7, "definit": 7, "reduc": [3, 7], "drastic": 7, "improv": 7, "runtim": 7, "separ": 7, "total": 7, "potential_match": [7, 10], "satisfi": 7, "elig": 7, "reshap": 7, "thought": 7, "ahead": 7, "chosen": 7, "experiment": [7, 10], "focus": 7, "demograph": 7, "moment": 7, "veri": [3, 7, 12], "anyon": 7, "percent": 7, "remain": 7, "popul": 7, "pull": 7, "fix": 7, "width": 7, "crosswalk": 7, "construct": 7, "alpha": 8, "hyperparamet": [8, 12], "de": 8, "param": [8, 12], "label": 8, "doc": [8, 9], "commonli": 8, "explan": 8, "randomforestclassifi": 8, "depth": 8, "20": 8, "featuresubsetstrategi": 8, "node": 8, "auto": 8, "onethird": 8, "sqrt": 8, "log2": 8, "15": 8, "generalizedlinearregress": 8, "famili": 8, "binomi": 8, "85": [8, 10], "logisticregress": 8, "decisiontreeclassifi": 8, "mininstancespernod": 8, "caus": 8, "left": 8, "right": [8, 10], "discard": 8, "maxbin": 8, "bin": 8, "discret": 8, "continu": [8, 9, 12], "gbtclassifi": 8, "mother": 9, "point": [9, 12], "x": [9, 10], "y": 9, "hold": 9, "except": 9, "strictli": 9, "inf": 9, "explicitli": 9, "cover": 9, "doubl": 9, "outsid": 9, "job": 10, "high": 10, "class": 10, "handl": 10, "main": 10, "complet": 10, "access": [10, 12], "link_run": 10, "factori": 10, "sparkfactori": 10, "load_config": 10, "load_conf_fil": 10, "sparksess": 10, "now": 10, "let": 10, "load": 10, "our": 10, "my_conf": 10, "lr": 10, "prep_step": 10, "get_step": 10, "enumer": 10, "print": 10, "input_table_nam": 10, "output_table_nam": 10, "run_step": 10, "get_tabl": 10, "matches_df": 10, "hh_model_explor": 10, "method": [10, 12], "interfac": 10, "easili": 10, "conveni": 10, "adjust": 10, "set_loc": 10, "set_num_cor": 10, "set_executor_memori": 10, "5g": 10, "ll": 10, "dictionari": 10, "often": [3, 10], "modul": 10, "pleas": 10, "reproduc": 10, "consol": 10, "cpu": 10, "h": 10, "executor_memori": [10, 12], "execute_task": 10, "execute_command": 10, "conf": [10, 12], "show": 10, "messag": 10, "exit": 10, "memori": 10, "executor": 10, "begin": 10, "execut": 10, "seri": 10, "excute_command": 10, "filepath": 10, "sai": 10, "fullcount_1870_1880": 10, "pattern": 10, "full_count_1870_1880": 10, "prompt": 10, "enter": 10, "text": 10, "unstabl": 10, "topic": 10, "analyz": [10, 12], "set_preexisting_t": 10, "x_persist": 10, "borrow_t": 10, "get_task": 10, "set_print_sql": 10, "x_sql": 10, "x_sqlf": 10, "ipython": 10, "showf": 10, "x_summari": 10, "desc": 10, "x_crosswalk": 10, "x_tab": 10, "q": [10, 12], "x_hh_tfam": 10, "x_tfam": 10, "drop_al": 10, "reload": 10, "x_hh_tfam_2a": 10, "x_tfam_raw": 10, "drop_all_prc": 10, "x_hh_tfam_2b": 10, "x_union": 10, "drop_all_temp": 10, "x_load": 10, "get_set": 10, "set_link_task": 10, "x_parquet_from_csv": 10, "organ": 10, "hierarchi": 10, "five": 10, "hh_match": 10, "someth": 10, "choic": 10, "preexist": 10, "prepped_df_a": 10, "prepped_df_b": 10, "raw_df_b": 10, "raw_df_a": 10, "training_featur": [10, 12], "scored_potential_match": 10, "potential_matches_prep": 10, "exploded_df_b": 10, "exploded_df_a": 10, "predicted_match": 10, "hh_training_featur": [10, 12], "hh_training_data": 10, "hh_predicted_match": 10, "hh_scored_potential_match": 10, "hh_potential_match": 10, "hh_blocked_match": 10, "hh_potential_matchs_prep": 10, "model_eval_training_vector": 10, "model_eval_training_data": 10, "model_eval_repeat_fp": 10, "model_eval_training_featur": 10, "model_eval_training_result": 10, "model_eval_repeat_fn": 10, "hh_model_eval_training_vector": 10, "hh_model_eval_repeat_fp": 10, "hh_model_eval_repeat_fn": 10, "hh_model_eval_training_result": 10, "hh_model_eval_training_featur": 10, "hh_model_eval_training_data": 10, "persist": 10, "hidden": 10, "intermedi": 10, "yet": 10, "databas": 10, "tablenam": 10, "istemporari": 10, "task_nam": 10, "num": 10, "finish": 10, "put": [10, 12], "launch": [10, 12], "my": [10, 12], "subhead": 11, "suppli": 11, "regex_word_replac": 11, "variant": 11, "av": 11, "7th": 11, "swap": 11, "still": 11, "anywher": 11, "proceed": 11, "street_unstd": 11, "dir": 11, "substitutions_street_abbrev": 11, "span": 12, "1920": 12, "deriv": 12, "necessari": [3, 12], "scenario": 12, "copi": [3, 12], "use_potential_matches_featur": 12, "full_count_1900_1910": 12, "50g": 12, "ask": 12, "arg": 12, "partit": 12, "training_data_1900_1910_hlink_featur": 12, "might": 12, "shut": 12, "framework": 12, "etc": 12, "relev": 12, "matrix": 12, "implement": 12, "regular": 12, "training_data_1900_1910": 12, "weren": 12, "ident": 12, "manual": 12, "updat": 12, "isn": 12, "analysi": 12, "training_result": 12, "hh_training_result": 12, "1900_1910_training_result": 12, "repeat_fp": 12, "repeat_fn": 12, "hh_repeat_fp": 12, "hh_repeat_fn": 12, "1900_1910_potential_fp": 12, "1900_1910_potential_fn": 12, "prefer": 12, "ve": 12, "or_group": 2, "belong": 2, "bpl1": 2, "bpl2": 2, "bpl3": 2, "parenthes": 2, "around": 2, "connect": 2, "few": 3, "util": 3, "resourc": 3, "affect": 3, "effect": 3, "comment": 3}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"column": [0, 2], "map": [0, 2], "basic": [0, 2], "usag": 0, "advanc": [0, 2, 12], "transform": [0, 1, 3, 9], "add_to_a": 0, "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "lowercase_strip": 0, "rationalize_name_word": 0, "remove_qmark_hyphen": 0, "remove_punctu": 0, "replace_apostroph": 0, "remove_alternate_nam": 0, "remove_suffix": 0, "remove_stop_word": 0, "remove_prefix": 0, "condense_strip_whitespac": 0, "remove_one_letter_nam": 0, "split": 0, "array_index": 0, "substr": 0, "divide_by_int": 0, "when_valu": 0, "get_floor": 0, "comparison": [1, 2], "type": [1, 9], "add": 1, "ons": 1, "aggreg": 1, "featur": [1, 2, 3, 9, 12], "household": [1, 2, 7], "maximum_jaro_winkl": 1, "jaro_winkl": 1, "jaro_winkler_street": 1, "max_jaro_winkl": 1, "equal": 1, "f1_match": 1, "f2_match": 1, "not_equ": 1, "equals_as_int": 1, "all_equ": 1, "not_zero_and_not_equ": 1, "time": 1, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "any_equ": 1, "either_are_1": 1, "either_are_0": 1, "second_gen_imm": 1, "rel_jaro_winkl": 1, "extra_children": 1, "jaro_winkler_r": 1, "sum": 1, "length_b": 1, "abs_diff": 1, "b_minus_a": 1, "geo_dist": 1, "fetch_a": 1, "fetch_b": 1, "present_both_year": 1, "neither_are_nul": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_and_equal_categorical_in_univers": 1, "multi_jaro_winkler_search": 1, "sql_condit": [1, 3], "alia": 1, "power": [1, 3], "threshold": 1, "lower_threshold": 1, "upper_threshold": 1, "gt_threshold": 1, "btwn_threshold": 1, "look_at_addl_var": 1, "hit": 1, "hits2": 1, "exact_mult": 1, "jw_max_a": 1, "jw_max_b": 1, "configur": [2, 4, 7], "config": 2, "file": 2, "top": 2, "level": 2, "data": [2, 11, 12], "sourc": [2, 5], "filter": 2, "substitut": [2, 11], "select": [2, 3], "potenti": [2, 12], "match": [2, 7], "univers": 2, "block": 2, "pipelin": [2, 9], "gener": [2, 9, 12], "train": [2, 7, 12], "model": [2, 7, 8, 12], "bigram": 3, "arrai": 3, "union": 3, "soundex": 3, "welcom": 4, "hlink": [4, 10], "": 4, "document": 4, "api": 4, "instal": 5, "requir": 5, "from": 5, "pypi": 5, "introduct": 6, "overview": [6, 7], "link": [7, 10, 12], "task": [7, 10], "preprocess": 7, "step": [7, 10], "relat": 7, "section": 7, "explor": [7, 12], "report": 7, "random_forest": 8, "probit": 8, "logistic_regress": 8, "decision_tre": 8, "gradient_boosted_tre": 8, "interact": [9, 10], "bucket": 9, "run": 10, "us": 10, "librari": 10, "mode": 10, "start": 10, "program": 10, "exampl": [10, 12], "workflow": [10, 12], "1": 11, "tabl": 11, "regex": 11, "word": 11, "replac": 11, "export": 12, "after": 12, "reus": 12, "differ": 12, "year": 12, "ml": 12, "list": 12, "fals": 12, "posit": 12, "neg": 12, "fp": 12, "fn": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 60}, "alltitles": {"Column Mappings": [[0, "column-mappings"], [2, "column-mappings"]], "Basic Usage": [[0, "basic-usage"]], "Advanced Usage": [[0, "advanced-usage"]], "Transforms": [[0, "transforms"]], "add_to_a": [[0, "add-to-a"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "lowercase_strip": [[0, "lowercase-strip"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_punctuation": [[0, "remove-punctuation"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_suffixes": [[0, "remove-suffixes"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_prefixes": [[0, "remove-prefixes"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "split": [[0, "split"]], "array_index": [[0, "array-index"]], "mapping": [[0, "mapping"]], "substring": [[0, "substring"]], "divide_by_int": [[0, "divide-by-int"]], "when_value": [[0, "when-value"]], "get_floor": [[0, "get-floor"]], "Comparison types, transform add-ons, aggregate features, and household aggregate features": [[1, "comparison-types-transform-add-ons-aggregate-features-and-household-aggregate-features"]], "Comparison types": [[1, "comparison-types"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "equals": [[1, "equals"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "not_equals": [[1, "not-equals"]], "equals_as_int": [[1, "equals-as-int"]], "all_equals": [[1, "all-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "and": [[1, "and"]], "times": [[1, "times"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "any_equals": [[1, "any-equals"]], "either_are_1": [[1, "either-are-1"]], "either_are_0": [[1, "either-are-0"]], "second_gen_imm": [[1, "second-gen-imm"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "extra_children": [[1, "extra-children"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "sum": [[1, "sum"]], "length_b": [[1, "length-b"]], "abs_diff": [[1, "abs-diff"]], "b_minus_a": [[1, "b-minus-a"]], "geo_distance": [[1, "geo-distance"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "present_both_years": [[1, "present-both-years"]], "neither_are_null": [[1, "neither-are-null"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "sql_condition": [[1, "sql-condition"], [3, "sql-condition"]], "Feature add-ons": [[1, "feature-add-ons"]], "alias": [[1, "alias"]], "power": [[1, "power"], [3, "power"]], "threshold": [[1, "threshold"]], "lower_threshold": [[1, "lower-threshold"]], "upper_threshold": [[1, "upper-threshold"]], "gt_threshold": [[1, "gt-threshold"]], "btwn_threshold": [[1, "btwn-threshold"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "Aggregate Features": [[1, "aggregate-features"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "exact_mult": [[1, "exact-mult"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "Configuration": [[2, "configuration"]], "Basic Config File": [[2, "basic-config-file"]], "Advanced Config File": [[2, "advanced-config-file"]], "Top level configs": [[2, "top-level-configs"]], "Data sources": [[2, "data-sources"]], "Filter": [[2, "filter"]], "Substitution Columns": [[2, "substitution-columns"]], "Feature Selections": [[2, "feature-selections"]], "Potential Matches Universe": [[2, "potential-matches-universe"]], "Blocking": [[2, "blocking"]], "Comparisons": [[2, "comparisons"]], "Household Comparisons": [[2, "household-comparisons"]], "Comparison Features": [[2, "comparison-features"]], "Pipeline-generated Features": [[2, "pipeline-generated-features"]], "Training and models": [[2, "training-and-models"]], "Household training and models": [[2, "household-training-and-models"]], "Welcome to hlink\u2019s documentation!": [[4, "welcome-to-hlink-s-documentation"]], "Configuration API": [[4, "configuration-api"], [4, null]], "Installation": [[5, "installation"]], "Requirements": [[5, "requirements"]], "Installing from PyPI": [[5, "installing-from-pypi"]], "Installing from source": [[5, "installing-from-source"]], "Introduction": [[6, "introduction"]], "Overview": [[6, "overview"], [7, "overview"], [7, "id1"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"]], "Link Tasks": [[7, "link-tasks"]], "Preprocessing": [[7, "preprocessing"]], "Task steps": [[7, "task-steps"], [7, "id2"], [7, "id5"], [7, "id8"], [7, "id11"], [7, "id14"]], "Related Configuration Sections": [[7, "related-configuration-sections"], [7, "id3"], [7, "id6"], [7, "id9"], [7, "id12"], [7, "id15"]], "Training and Household Training": [[7, "training-and-household-training"]], "Matching": [[7, "matching"]], "Household Matching": [[7, "household-matching"]], "Model Exploration and Household Model Exploration": [[7, "model-exploration-and-household-model-exploration"]], "Reporting": [[7, "reporting"]], "Models": [[8, "models"]], "random_forest": [[8, "random-forest"]], "probit": [[8, "probit"]], "logistic_regression": [[8, "logistic-regression"]], "decision_tree": [[8, "decision-tree"]], "gradient_boosted_trees": [[8, "gradient-boosted-trees"]], "Pipeline generated features": [[9, "pipeline-generated-features"]], "Transformer types": [[9, "transformer-types"]], "interaction": [[9, "interaction"]], "bucketizer": [[9, "bucketizer"]], "Running hlink": [[10, "running-hlink"]], "Using hlink as a Library": [[10, "using-hlink-as-a-library"]], "Interactive Mode": [[10, "interactive-mode"]], "Starting the program": [[10, "starting-the-program"]], "Running Linking Tasks and Steps": [[10, "running-linking-tasks-and-steps"]], "Example interactive mode workflow": [[10, "example-interactive-mode-workflow"]], "Substitutions": [[11, "substitutions"]], "1:1 substitution by data table": [[11, "substitution-by-data-table"]], "Substitution by regex word replace": [[11, "substitution-by-regex-word-replace"]], "Advanced Workflow Examples": [[12, "advanced-workflow-examples"]], "Export training data after generating features to reuse in different linking years": [[12, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Example training data export with generated ML features": [[12, "example-training-data-export-with-generated-ml-features"]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[12, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Example model exploration and FP/FN export workflow": [[12, "example-model-exploration-and-fp-fn-export-workflow"]], "Feature Selection Transforms": [[3, "feature-selection-transforms"]], "bigrams": [[3, "bigrams"]], "array": [[3, "array"]], "union": [[3, "union"]], "soundex": [[3, "soundex"]]}, "indexentries": {}}) \ No newline at end of file diff --git a/hlink/linking/core/transforms.py b/hlink/linking/core/transforms.py index 0c771f0..79df12e 100755 --- a/hlink/linking/core/transforms.py +++ b/hlink/linking/core/transforms.py @@ -30,6 +30,304 @@ from pyspark.ml.feature import NGram, RegexTokenizer, CountVectorizer, MinHashLSH +def _get_transforms( + feature_selections: list[dict[str, Any]], name: str, is_a: bool +) -> list[dict[str, Any]]: + """ + Filter the given list of feature selections for those that have the + transform `name` and are active for the datasource indicated by `is_a`. + + feature_selections: the list of feature selections to filter + name: the name of the transform to filter for, e.g. "neighbor_aggregate" + is_a: whether this is for datasource A (True) or datasource B (False) + """ + to_process = [] + for feature_selection in feature_selections: + if ("override_column_a" in feature_selection) and is_a: + pass + elif ("override_column_b" in feature_selection) and not is_a: + pass + elif ("set_value_column_a" in feature_selection) and is_a: + pass + elif ("set_value_column_b" in feature_selection) and not is_a: + pass + elif feature_selection["transform"] == name: + to_process.append(feature_selection) + + return to_process + + +def _parse_feature_selections( + spark: SparkSession, + link_task, + df_selected: DataFrame, + feature_selection: dict[str, Any], + id_col: str, + is_a: bool, +) -> DataFrame: + """ + Parse the `feature_selection` and add it to `df_selected` as a new column. + This looks at what type of transform the `feature_selection` is to + determine how to compute it. Note that this function adds the new column to + the return data frame lazily and does not `collect()` the data frame. + + spark: the Spark session + link_task: the current link task + df_selected: the data frame to use for computation + feature_selection: the feature selection to compute, which may depend on + columns in `df_selected` + id_col: the identifier column for the data frame + is_a: whether this is for datasource A (True) or datasource B (False) + """ + transform = feature_selection["transform"] + + if not feature_selection.get("output_column", False): + feature_selection["output_column"] = feature_selection["output_col"] + + if "checkpoint" in feature_selection and feature_selection["checkpoint"]: + df_selected = df_selected.checkpoint() + + if "override_column_a" in feature_selection and is_a: + override_name = feature_selection["override_column_a"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], df_selected[override_name] + ) + return df_selected + + elif "override_column_b" in feature_selection and not is_a: + override_name = feature_selection["override_column_b"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], df_selected[override_name] + ) + return df_selected + + elif "set_value_column_a" in feature_selection and is_a: + set_value = feature_selection["set_value_column_a"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], lit(set_value) + ) + return df_selected + + elif "set_value_column_b" in feature_selection and not is_a: + set_value = feature_selection["set_value_column_b"] + df_selected = df_selected.withColumn( + feature_selection["output_column"], lit(set_value) + ) + return df_selected + + elif transform == "bigrams": + input_col = feature_selection["input_column"] + output_col = feature_selection["output_column"] + intermediate_col = input_col + "_tokens" + unsorted_col = input_col + "_unsorted" + if "no_first_pad" in feature_selection and feature_selection["no_first_pad"]: + input_col_space = input_col + else: + input_col_space = input_col + "_space" + df_selected = df_selected.withColumn( + input_col_space, concat(lit(" "), input_col) + ) + tokenizer_a = RegexTokenizer( + pattern="", inputCol=input_col_space, outputCol=intermediate_col + ) + ngram_a = NGram(n=2, inputCol=intermediate_col, outputCol=output_col) + pipeline = Pipeline(stages=[tokenizer_a, ngram_a]) + df_selected = pipeline.fit(df_selected).transform(df_selected) + df_selected = df_selected.withColumn(unsorted_col, df_selected[output_col]) + df_selected = df_selected.withColumn( + output_col, sort_array(df_selected[unsorted_col]) + ) + return df_selected + + elif transform == "sql_condition": + cond = feature_selection["condition"] + output_col = feature_selection["output_column"] + df_selected = df_selected.withColumn(output_col, expr(cond)) + return df_selected + + elif transform == "array": + input_cols = feature_selection["input_columns"] + output_col = feature_selection["output_column"] + df_selected = df_selected.withColumn(output_col, array(input_cols)) + return df_selected + + elif transform == "union": + col1, col2 = feature_selection["input_columns"] + output_col = feature_selection["output_column"] + + def union_list(list_a, list_b): + return list(set(list_a).union(set(list_b))) + + union_list_udf = udf(union_list, ArrayType(StringType())) + df_selected = df_selected.withColumn(output_col, union_list_udf(col1, col2)) + return df_selected + + elif transform == "hash": + input_col = feature_selection["input_column"] + count_col = feature_selection["output_column"] + "_count" + hash_array_col = feature_selection["output_column"] + df_selected = df_selected.where(f"size({input_col}) > 0") + count_vect = CountVectorizer(inputCol=input_col, outputCol=count_col) + lsh = MinHashLSH( + inputCol=count_col, + outputCol=hash_array_col, + numHashTables=feature_selection["number"], + seed=445123, + ) + # non_zero = udf(lambda v: v.numNonzeros() > 0, BooleanType()) + # hha_count_nonzero = hha_counts.where(non_zero(F.col("word_counts"))) + cv_model = count_vect.fit(df_selected) + df_transformed = cv_model.transform(df_selected) + lsh_model = lsh.fit(df_transformed) + df_selected = lsh_model.transform(df_transformed) + return df_selected + + elif transform == "soundex": + input_col = feature_selection["input_column"] + output_col = feature_selection["output_column"] + df_selected = df_selected.withColumn(output_col, soundex(input_col)) + return df_selected + + elif transform == "neighbor_aggregate": + return df_selected + # df_selected.createOrReplaceTempView("prepped_df_tmp") + # link_task.run_register_sql("hh_nbor_rank", t_ctx=feature_selection) + # link_task.run_register_sql("hh_nbor", t_ctx=feature_selection) + # df_selected = link_task.run_register_sql( + # None, template="attach_neighbor_col", t_ctx=feature_selection + # ) + # spark.catalog.dropTempView("prepped_df_tmp") + # spark.catalog.dropTempView("hh_nbor") + # spark.catalog.dropTempView("hh_nbor_rank") + + elif transform == "attach_family_col": + return df_selected + + elif transform == "related_individuals": + df_selected.createOrReplaceTempView("prepped_df_tmp") + df_selected = link_task.run_register_sql( + None, + template="attach_related_col", + t_ctx={ + "output_col": feature_selection["output_col"], + "input_col": feature_selection["input_col"], + "prepped_df": "prepped_df_tmp", + "family_id": feature_selection["family_id"], + "relate_col": feature_selection["relate_col"], + "top_code": feature_selection["top_code"], + "bottom_code": feature_selection["bottom_code"], + "id": id_col, + }, + ) + spark.catalog.dropTempView("prepped_df_tmp") + return df_selected + + elif transform == "related_individual_rows": + return df_selected + # df_selected.createOrReplaceTempView("prepped_df_tmp") + # relate_filter = ( + # feature_selection["filter_b"] + # if (not (is_a) and "filter_b" in feature_selection) + # else None + # ) + # df_selected = link_task.run_register_sql( + # None, + # template="attach_related_cols_as_rows", + # t_ctx={ + # "output_col": feature_selection["output_col"], + # "input_cols": feature_selection["input_cols"], + # "prepped_df": "prepped_df_tmp", + # "family_id": feature_selection["family_id"], + # "relate_col": feature_selection["relate_col"], + # "top_code": feature_selection["top_code"], + # "bottom_code": feature_selection["bottom_code"], + # "id": id_col, + # "filter": relate_filter, + # }, + # ) + # spark.catalog.dropTempView("prepped_df_tmp") + + elif transform == "popularity": + input_cols = feature_selection.get("input_cols", False) + output_col = feature_selection["output_col"] + + # this should be a dictionary key:col_name, value:integer to be used for range + range_col = feature_selection.get("range_col", False) + range_val = feature_selection.get("range_val", False) + + if range_col and range_val: + if input_cols: + window = ( + Window.partitionBy([df_selected[col] for col in input_cols]) + .orderBy(df_selected[range_col]) + .rangeBetween(-range_val, range_val) + ) + else: + window = Window.orderBy(df_selected[range_col]).rangeBetween( + -range_val, range_val + ) + else: + window = Window.partitionBy([df_selected[col] for col in input_cols]) + + df_selected = df_selected.select( + df_selected["*"], count(lit(1)).over(window).alias(output_col) + ) + return df_selected + + elif transform == "power": + input_col = feature_selection["input_col"] + output_col = feature_selection["output_col"] + exponent = feature_selection["exponent"] + df_selected = df_selected.select( + "*", pow(df_selected[input_col], exponent).alias(output_col) + ) + return df_selected + + elif transform == "attach_variable": + input_col = feature_selection["input_column"] # join key in core data + output_col = feature_selection[ + "output_column" + ] # desired alias for the added variable + col_to_join_on = feature_selection["col_to_join_on"] # join key in csv data + col_to_add = feature_selection["col_to_add"] # column to add from csv data + region_dict = feature_selection["region_dict"] # path to csv data file + null_filler = feature_selection["null_filler"] # value to replace null values + col_type = feature_selection["col_type"] + + df_selected.createOrReplaceTempView("prepped_df_tmp") + + # open up csv file + link_task.run_register_python( + name="region_data", + func=lambda: spark.read.csv(region_dict, header=True, inferSchema=True), + # persist=True, + ) + # self.spark.table("region_data").region.cast("int") + + # join the csv file to the dataframe (df_selected) + df_selected = link_task.run_register_sql( + None, + template="attach_variable", + t_ctx={ + "input_col": input_col, + "output_col": output_col, + "prepped_df": "prepped_df_tmp", + "col_to_join_on": col_to_join_on, + "col_to_add": col_to_add, + "region_data": "region_data", + }, + ) + df_selected = df_selected.fillna(null_filler, subset=[output_col]) + df_selected = df_selected.withColumn( + output_col, df_selected[output_col].cast(col_type) + ) + spark.catalog.dropTempView("prepped_df_tmp") + return df_selected + + else: + raise ValueError(f"Invalid transform type for {transform}") + + def generate_transforms( spark: SparkSession, df_selected: DataFrame, @@ -38,6 +336,16 @@ def generate_transforms( is_a: bool, id_col: str, ) -> DataFrame: + """Generate feature selection columns and return the input dataframe with these new columns attached. + + Args: + spark: the Spark session + df_selected: the input Spark DataFrame + feature_selections: a list of feature selections to compute + link_task: the current LinkTask + is_a: whether this is dataset A (True) or dataset B (False) + id_col: the name of the identifier column in the input data frame + """ not_skipped_feature_selections = [ c for c in feature_selections @@ -50,285 +358,17 @@ def generate_transforms( if ("post_agg_feature" in c) and c["post_agg_feature"] ] - def parse_feature_selections( - df_selected: DataFrame, feature_selection: dict[str, Any], is_a: bool - ) -> DataFrame: - transform = feature_selection["transform"] - - if not feature_selection.get("output_column", False): - feature_selection["output_column"] = feature_selection["output_col"] - - if "checkpoint" in feature_selection and feature_selection["checkpoint"]: - df_selected = df_selected.checkpoint() - - if "override_column_a" in feature_selection and is_a: - override_name = feature_selection["override_column_a"] - df_selected = df_selected.withColumn( - feature_selection["output_column"], df_selected[override_name] - ) - return df_selected - - elif "override_column_b" in feature_selection and not is_a: - override_name = feature_selection["override_column_b"] - df_selected = df_selected.withColumn( - feature_selection["output_column"], df_selected[override_name] - ) - return df_selected - - elif "set_value_column_a" in feature_selection and is_a: - set_value = feature_selection["set_value_column_a"] - df_selected = df_selected.withColumn( - feature_selection["output_column"], lit(set_value) - ) - return df_selected - - elif "set_value_column_b" in feature_selection and not is_a: - set_value = feature_selection["set_value_column_b"] - df_selected = df_selected.withColumn( - feature_selection["output_column"], lit(set_value) - ) - return df_selected - - elif transform == "bigrams": - input_col = feature_selection["input_column"] - output_col = feature_selection["output_column"] - intermediate_col = input_col + "_tokens" - unsorted_col = input_col + "_unsorted" - if ( - "no_first_pad" in feature_selection - and feature_selection["no_first_pad"] - ): - input_col_space = input_col - else: - input_col_space = input_col + "_space" - df_selected = df_selected.withColumn( - input_col_space, concat(lit(" "), input_col) - ) - tokenizer_a = RegexTokenizer( - pattern="", inputCol=input_col_space, outputCol=intermediate_col - ) - ngram_a = NGram(n=2, inputCol=intermediate_col, outputCol=output_col) - pipeline = Pipeline(stages=[tokenizer_a, ngram_a]) - df_selected = pipeline.fit(df_selected).transform(df_selected) - df_selected = df_selected.withColumn(unsorted_col, df_selected[output_col]) - df_selected = df_selected.withColumn( - output_col, sort_array(df_selected[unsorted_col]) - ) - return df_selected - - elif transform == "sql_condition": - cond = feature_selection["condition"] - output_col = feature_selection["output_column"] - df_selected = df_selected.withColumn(output_col, expr(cond)) - return df_selected - - elif transform == "array": - input_cols = feature_selection["input_columns"] - output_col = feature_selection["output_column"] - df_selected = df_selected.withColumn(output_col, array(input_cols)) - return df_selected - - elif transform == "union": - col1, col2 = feature_selection["input_columns"] - output_col = feature_selection["output_column"] - - def union_list(list_a, list_b): - return list(set(list_a).union(set(list_b))) - - union_list_udf = udf(union_list, ArrayType(StringType())) - df_selected = df_selected.withColumn(output_col, union_list_udf(col1, col2)) - return df_selected - - elif transform == "hash": - input_col = feature_selection["input_column"] - count_col = feature_selection["output_column"] + "_count" - hash_array_col = feature_selection["output_column"] - df_selected = df_selected.where(f"size({input_col}) > 0") - count_vect = CountVectorizer(inputCol=input_col, outputCol=count_col) - lsh = MinHashLSH( - inputCol=count_col, - outputCol=hash_array_col, - numHashTables=feature_selection["number"], - seed=445123, - ) - # non_zero = udf(lambda v: v.numNonzeros() > 0, BooleanType()) - # hha_count_nonzero = hha_counts.where(non_zero(F.col("word_counts"))) - cv_model = count_vect.fit(df_selected) - df_transformed = cv_model.transform(df_selected) - lsh_model = lsh.fit(df_transformed) - df_selected = lsh_model.transform(df_transformed) - return df_selected - - elif transform == "soundex": - input_col = feature_selection["input_column"] - output_col = feature_selection["output_column"] - df_selected = df_selected.withColumn(output_col, soundex(input_col)) - return df_selected - - elif transform == "neighbor_aggregate": - return df_selected - # df_selected.createOrReplaceTempView("prepped_df_tmp") - # link_task.run_register_sql("hh_nbor_rank", t_ctx=feature_selection) - # link_task.run_register_sql("hh_nbor", t_ctx=feature_selection) - # df_selected = link_task.run_register_sql( - # None, template="attach_neighbor_col", t_ctx=feature_selection - # ) - # spark.catalog.dropTempView("prepped_df_tmp") - # spark.catalog.dropTempView("hh_nbor") - # spark.catalog.dropTempView("hh_nbor_rank") - - elif transform == "attach_family_col": - return df_selected - - elif transform == "related_individuals": - df_selected.createOrReplaceTempView("prepped_df_tmp") - df_selected = link_task.run_register_sql( - None, - template="attach_related_col", - t_ctx={ - "output_col": feature_selection["output_col"], - "input_col": feature_selection["input_col"], - "prepped_df": "prepped_df_tmp", - "family_id": feature_selection["family_id"], - "relate_col": feature_selection["relate_col"], - "top_code": feature_selection["top_code"], - "bottom_code": feature_selection["bottom_code"], - "id": id_col, - }, - ) - spark.catalog.dropTempView("prepped_df_tmp") - return df_selected - - elif transform == "related_individual_rows": - return df_selected - # df_selected.createOrReplaceTempView("prepped_df_tmp") - # relate_filter = ( - # feature_selection["filter_b"] - # if (not (is_a) and "filter_b" in feature_selection) - # else None - # ) - # df_selected = link_task.run_register_sql( - # None, - # template="attach_related_cols_as_rows", - # t_ctx={ - # "output_col": feature_selection["output_col"], - # "input_cols": feature_selection["input_cols"], - # "prepped_df": "prepped_df_tmp", - # "family_id": feature_selection["family_id"], - # "relate_col": feature_selection["relate_col"], - # "top_code": feature_selection["top_code"], - # "bottom_code": feature_selection["bottom_code"], - # "id": id_col, - # "filter": relate_filter, - # }, - # ) - # spark.catalog.dropTempView("prepped_df_tmp") - - elif transform == "popularity": - input_cols = feature_selection.get("input_cols", False) - output_col = feature_selection["output_col"] - - # this should be a dictionary key:col_name, value:integer to be used for range - range_col = feature_selection.get("range_col", False) - range_val = feature_selection.get("range_val", False) - - if range_col and range_val: - if input_cols: - window = ( - Window.partitionBy([df_selected[col] for col in input_cols]) - .orderBy(df_selected[range_col]) - .rangeBetween(-range_val, range_val) - ) - else: - window = Window.orderBy(df_selected[range_col]).rangeBetween( - -range_val, range_val - ) - else: - window = Window.partitionBy([df_selected[col] for col in input_cols]) - - df_selected = df_selected.select( - df_selected["*"], count(lit(1)).over(window).alias(output_col) - ) - return df_selected - - elif transform == "power": - input_col = feature_selection["input_col"] - output_col = feature_selection["output_col"] - exponent = feature_selection["exponent"] - df_selected = df_selected.select( - "*", pow(df_selected[input_col], exponent).alias(output_col) - ) - return df_selected - - elif transform == "attach_variable": - input_col = feature_selection["input_column"] # join key in core data - output_col = feature_selection[ - "output_column" - ] # desired alias for the added variable - col_to_join_on = feature_selection["col_to_join_on"] # join key in csv data - col_to_add = feature_selection["col_to_add"] # column to add from csv data - region_dict = feature_selection["region_dict"] # path to csv data file - null_filler = feature_selection[ - "null_filler" - ] # value to replace null values - col_type = feature_selection["col_type"] - - df_selected.createOrReplaceTempView("prepped_df_tmp") - - # open up csv file - link_task.run_register_python( - name="region_data", - func=lambda: spark.read.csv(region_dict, header=True, inferSchema=True), - # persist=True, - ) - # self.spark.table("region_data").region.cast("int") - - # join the csv file to the dataframe (df_selected) - df_selected = link_task.run_register_sql( - None, - template="attach_variable", - t_ctx={ - "input_col": input_col, - "output_col": output_col, - "prepped_df": "prepped_df_tmp", - "col_to_join_on": col_to_join_on, - "col_to_add": col_to_add, - "region_data": "region_data", - }, - ) - df_selected = df_selected.fillna(null_filler, subset=[output_col]) - df_selected = df_selected.withColumn( - output_col, df_selected[output_col].cast(col_type) - ) - spark.catalog.dropTempView("prepped_df_tmp") - return df_selected - - else: - raise ValueError(f"Invalid transform type for {transform}") - for feature_selection in not_skipped_feature_selections: - df_selected = parse_feature_selections(df_selected, feature_selection, is_a) - - def get_transforms(name: str, is_a: bool) -> list[dict[str, Any]]: - to_process = [] - for f in not_skipped_feature_selections: - if ("override_column_a" in f) and is_a: - pass - elif ("override_column_b" in f) and not is_a: - pass - elif ("set_value_column_a" in f) and is_a: - pass - elif ("set_value_column_b" in f) and not is_a: - pass - elif f["transform"] == name: - to_process.append(f) - - return to_process + df_selected = _parse_feature_selections( + spark, link_task, df_selected, feature_selection, id_col, is_a + ) hh_transforms = [ - get_transforms("attach_family_col", is_a), - get_transforms("related_individual_rows", is_a), - get_transforms("neighbor_aggregate", is_a), + _get_transforms(not_skipped_feature_selections, "attach_family_col", is_a), + _get_transforms( + not_skipped_feature_selections, "related_individual_rows", is_a + ), + _get_transforms(not_skipped_feature_selections, "neighbor_aggregate", is_a), ] if any(hh_transforms): attach_ts, related_ts, neighbor_ts = hh_transforms @@ -397,7 +437,9 @@ def get_transforms(name: str, is_a: bool) -> list[dict[str, Any]]: ) for feature_selection in post_agg_feature_selections: - df_selected = parse_feature_selections(df_selected, feature_selection, is_a) + df_selected = _parse_feature_selections( + spark, link_task, df_selected, feature_selection, id_col, is_a + ) return df_selected diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py index f072333..48b5ce0 100644 --- a/hlink/tests/core/transforms_test.py +++ b/hlink/tests/core/transforms_test.py @@ -195,6 +195,54 @@ def test_generate_transforms_override_column_b( ] +@pytest.mark.parametrize("is_a", [True, False]) +def test_generate_transforms_skip_attribute_skips_transform( + spark: SparkSession, preprocessing: LinkTask, is_a: bool +) -> None: + """When a feature selection has an attribute "skip" set to True, + generate_transforms() ignores it and doesn't include it in the output data + frame. + """ + feature_selections = [ + { + "input_column": "name", + "output_column": "name_bigrams", + "transform": "bigrams", + "skip": True, + } + ] + + df = spark.createDataFrame([[0, "martin"]], "id:integer, name:string") + df_result = generate_transforms( + spark, df, feature_selections, preprocessing, is_a, "id" + ) + # There's no output "name_bigrams" column because the feature selection was skipped + assert df_result.columns == ["id", "name"] + + +@pytest.mark.parametrize("is_a", [True, False]) +def test_generate_transforms_skip_attribute_does_not_skip_if_false( + spark: SparkSession, preprocessing: LinkTask, is_a: bool +) -> None: + """When a feature selection has an attribute "skip", but it's set to False, + generate_transforms() computes the feature selection as normal. + """ + feature_selections = [ + { + "input_column": "name", + "output_column": "name_bigrams", + "transform": "bigrams", + "skip": False, + } + ] + + df = spark.createDataFrame([[0, "martin"]], "id:integer, name:string") + df_result = generate_transforms( + spark, df, feature_selections, preprocessing, is_a, "id" + ) + assert "name_bigrams" in df_result.columns + + @pytest.mark.parametrize("is_a", [True, False]) def test_generate_transforms_error_when_unrecognized_transform( spark: SparkSession, preprocessing: LinkTask, is_a: bool diff --git a/sphinx-docs/feature_selection_transforms.md b/sphinx-docs/feature_selection_transforms.md index 3f89ee1..0cf35b0 100644 --- a/sphinx-docs/feature_selection_transforms.md +++ b/sphinx-docs/feature_selection_transforms.md @@ -21,6 +21,9 @@ few utility attributes which are available for all transforms: - `checkpoint` - Type: `boolean`. Optional. If set to true, checkpoint the dataset in Spark before computing the feature selection. This can reduce some resource usage for very complex workflows, but should not be necessary. +- `skip` - Type: `boolean`. Optional. If set to true, don't compute this + feature selection. This has the same effect as commenting the feature + selection out of your config file. ## bigrams