diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index 0653279b..f987552a 100755 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "9bd01afc", + "id": "07156c67", "metadata": {}, "source": [ "# Nemo Curator Pipeline Example\n", @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "7b1808ea", + "id": "9e693ac6", "metadata": {}, "source": [ "## About this notebook\n", @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "78537bd7", + "id": "1dbd784d", "metadata": {}, "source": [ "## Prerequisites\n", @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "062b5423", + "id": "f0d543a4", "metadata": {}, "source": [ "## 0. Env Setup" @@ -92,7 +92,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "8add9bbd", + "id": "e9274d20", "metadata": {}, "outputs": [ { @@ -114,7 +114,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "9940c70d", + "id": "2852006b", "metadata": {}, "outputs": [], "source": [ @@ -143,7 +143,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "fd8a381d", + "id": "cc537a19", "metadata": {}, "outputs": [], "source": [ @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "589ff257", + "id": "c148e0e7", "metadata": {}, "outputs": [ { @@ -191,7 +191,7 @@ }, { "cell_type": "markdown", - "id": "662d505f", + "id": "b48397aa", "metadata": {}, "source": [ "## 1. Download\n", @@ -233,7 +233,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "adb59379", + "id": "a1ae9fe5", "metadata": {}, "outputs": [], "source": [ @@ -242,7 +242,7 @@ }, { "cell_type": "markdown", - "id": "9b56f12a", + "id": "983b4433", "metadata": {}, "source": [ " Start a CPU based Dask cluster. Please modify `n_workers` and `memory_limit` according to your hardware specification. 
To process TH wikipedia data, it's advised to have `memory_limit` greater than 12GB" @@ -251,7 +251,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "e822b5ac", + "id": "1e98fe2c", "metadata": {}, "outputs": [], "source": [ @@ -261,7 +261,7 @@ }, { "cell_type": "markdown", - "id": "e90cc8b1", + "id": "b7bcc8e2", "metadata": {}, "source": [ "Define parameters" @@ -270,7 +270,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "9a03b463", + "id": "c2e8f5fe", "metadata": {}, "outputs": [], "source": [ @@ -286,7 +286,7 @@ }, { "cell_type": "markdown", - "id": "f41734a1", + "id": "219a7c67", "metadata": {}, "source": [ "Download TH wikipedia data" @@ -295,7 +295,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a45965a7", + "id": "a15b257b", "metadata": {}, "outputs": [], "source": [ @@ -307,7 +307,7 @@ }, { "cell_type": "markdown", - "id": "22b7d5b3", + "id": "039a2d64", "metadata": {}, "source": [ "Verify result" @@ -316,7 +316,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "45a69041", + "id": "0655085d", "metadata": {}, "outputs": [ { @@ -336,7 +336,7 @@ { "cell_type": "code", "execution_count": 27, - "id": "53bdccfd", + "id": "bc79a6b1", "metadata": {}, "outputs": [ { @@ -354,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "c5f58643", + "id": "6cc71627", "metadata": {}, "source": [ "**[Optional]** Close the Dask cluster. You might encounter an error such as `Caught signal 11`. It's OK, just rerun the cell again." 
@@ -363,7 +363,7 @@ { "cell_type": "code", "execution_count": 28, - "id": "0669a830", + "id": "772beb67", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "43334988", + "id": "b7deff3c", "metadata": {}, "source": [ "## 2. Language separation and Unicode fixing" @@ -381,7 +381,7 @@ }, { "cell_type": "markdown", - "id": "86ccdc1f", + "id": "8c62d002", "metadata": {}, "source": [ "In this section, we will be using a language classification model by fasttext to separate the TH wikipedia dataset based on each document's major language, and we will also fix the Unicode in the documents. Detailed steps are:\n", @@ -397,7 +397,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "1e9198e8", + "id": "b980634f", "metadata": {}, "outputs": [], "source": [ @@ -408,7 +408,7 @@ }, { "cell_type": "markdown", - "id": "76e46d2a", + "id": "93f44200", "metadata": {}, "source": [ "**[Optional]** Start a CPU based Dask cluster." @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "da3aed8a", + "id": "37199297", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "4a72479c", + "id": "cdbffdd4", "metadata": {}, "source": [ "Define parameters" @@ -436,7 +436,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "13b9d2b1", + "id": "c591b7ce", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "8df0322a", + "id": "103ea67e", "metadata": {}, "source": [ "Download fasttext model" @@ -470,7 +470,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "2666727d", + "id": "72371a81", "metadata": {}, "outputs": [ { @@ -497,7 +497,7 @@ }, { "cell_type": "markdown", - "id": "58452516", + "id": "77786906", "metadata": {}, "source": [ "Apply fasttext model to separate documents by their languages" @@ -506,7 +506,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "d8b8c491", + "id": "7961975a", "metadata": {}, 
"outputs": [ { @@ -553,7 +553,7 @@ }, { "cell_type": "markdown", - "id": "d443a5d1", + "id": "0b1c3bf5", "metadata": {}, "source": [ "Load `UnicodeReformatter` to reformat any Unicode that appears in the desired language dataset" @@ -562,7 +562,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "272a5f67", + "id": "c76ba9b3", "metadata": {}, "outputs": [ { @@ -593,7 +593,7 @@ }, { "cell_type": "markdown", - "id": "9bd57a53", + "id": "fbfb9c7c", "metadata": {}, "source": [ "Verify the result. We can see that some documents have been removed from the TH wikipedia dataset since the number of lines in this output file is less than the original file (no. of lines = 162164)" @@ -602,7 +602,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "e3329c83", + "id": "83594e02", "metadata": {}, "outputs": [ { @@ -621,7 +621,7 @@ }, { "cell_type": "markdown", - "id": "0b6cbc26", + "id": "362cad99", "metadata": {}, "source": [ "Further verify by loading documents that have been identified as another language, such as 'EN'. We can see from the output that the removed document is indeed in English and contains very little or even no Thai." @@ -630,7 +630,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "050d944c", + "id": "2de2bff8", "metadata": {}, "outputs": [ { @@ -648,7 +648,7 @@ }, { "cell_type": "markdown", - "id": "7d17f010", + "id": "16f1dd76", "metadata": {}, "source": [ "**[Optional]** Close the Dask cluster." 
@@ -657,7 +657,7 @@ { "cell_type": "code", "execution_count": 37, - "id": "7e64cc35", + "id": "fef7c3f4", "metadata": {}, "outputs": [], "source": [ @@ -667,7 +667,7 @@ }, { "cell_type": "markdown", - "id": "1d46cece", + "id": "b0a3bd0e", "metadata": {}, "source": [ "## 3.Add ID\n", @@ -680,7 +680,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "5f788b91", + "id": "13b75945", "metadata": {}, "outputs": [], "source": [ @@ -689,7 +689,7 @@ }, { "cell_type": "markdown", - "id": "cd17be33", + "id": "5b50ace5", "metadata": {}, "source": [ "**[Optional]** If there is no running Dask cluster, start CPU based Dask cluster." @@ -698,7 +698,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "5ba1d54a", + "id": "6a2a8ac0", "metadata": {}, "outputs": [], "source": [ @@ -708,7 +708,7 @@ }, { "cell_type": "markdown", - "id": "12f59d5e", + "id": "fa7b661e", "metadata": {}, "source": [ "Define relevant parameters" @@ -717,7 +717,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "843eba7f", + "id": "aa5ca4a3", "metadata": {}, "outputs": [], "source": [ @@ -733,7 +733,7 @@ }, { "cell_type": "markdown", - "id": "e7a8307c", + "id": "50e99470", "metadata": {}, "source": [ "Adding ID to dataset" @@ -742,7 +742,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "b7a91bf1", + "id": "ba122faf", "metadata": {}, "outputs": [ { @@ -772,7 +772,7 @@ }, { "cell_type": "markdown", - "id": "e92b5dab", + "id": "8523fe53", "metadata": {}, "source": [ "Verify the result. From the output, we can see that the `id` value has been changed to `TH_wiki-0000000000` " @@ -781,7 +781,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "e585cedd", + "id": "efa34cf2", "metadata": {}, "outputs": [ { @@ -799,7 +799,7 @@ }, { "cell_type": "markdown", - "id": "0cbddf6e", + "id": "97e4ac0d", "metadata": {}, "source": [ "Close Dask cluster. 
This cell needs to be run as we are starting a new GPU Dask cluster in the following task" @@ -808,7 +808,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "4daa1f2a", + "id": "cd84e1f5", "metadata": {}, "outputs": [], "source": [ @@ -818,7 +818,7 @@ }, { "cell_type": "markdown", - "id": "1baf027e", + "id": "be17d85b", "metadata": {}, "source": [ "## 4. Exact Deduplication\n", @@ -835,7 +835,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "3f7ba34c", + "id": "9a95cdf7", "metadata": {}, "outputs": [], "source": [ @@ -844,7 +844,7 @@ }, { "cell_type": "markdown", - "id": "e268cfca", + "id": "49af3613", "metadata": {}, "source": [ "Start a GPU based Dask cluster. Since a GPU based Dask cluster involves setting several arguments, we will use the `get_client()` wrapper function to set it up quickly. " @@ -853,7 +853,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "4b73e5f9", + "id": "ab34f1ee", "metadata": {}, "outputs": [ { @@ -882,7 +882,7 @@ }, { "cell_type": "markdown", - "id": "0fc99440", + "id": "af273aec", "metadata": {}, "source": [ "If you encounter the following error\n", @@ -894,7 +894,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "a590c78a", + "id": "7981830e", "metadata": {}, "outputs": [], "source": [ @@ -903,7 +903,7 @@ }, { "cell_type": "markdown", - "id": "0151abe0", + "id": "66874211", "metadata": {}, "source": [ "Define parameters" @@ -912,7 +912,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "54b627a4", + "id": "461fe0b9", "metadata": {}, "outputs": [], "source": [ @@ -932,7 +932,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "6ede2e41", + "id": "d9c2a0b0", "metadata": {}, "outputs": [], "source": [ @@ -942,7 +942,7 @@ }, { "cell_type": "markdown", - "id": "1882204a", + "id": "69f94a52", "metadata": {}, "source": [ "Apply exact deduplication" @@ -951,7 +951,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "dfaaa765", + "id": "178bc429", "metadata": {}, "outputs": [ { @@ -1000,7 
+1000,7 @@ }, { "cell_type": "markdown", - "id": "e68f0399", + "id": "8e33f1f4", "metadata": {}, "source": [ "Verify the output duplicated ID. We can group by the `_hashes` to get the list of duplicated documents having the same `_hashes` and use `extract_lines_with_id()` to verify that those documents are indeed exact duplicates. Please note that the `id` might change; therefore, please replace the `target_list` when necessary" @@ -1009,7 +1009,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "28d8bb0b", + "id": "d11b13bd", "metadata": {}, "outputs": [ { @@ -1097,7 +1097,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "fca41870", + "id": "a54073fb", "metadata": {}, "outputs": [ { @@ -1183,7 +1183,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "8c9624ac", + "id": "b9d1a4fc", "metadata": {}, "outputs": [ { @@ -1203,7 +1203,7 @@ }, { "cell_type": "markdown", - "id": "4013203c", + "id": "edb88ba9", "metadata": {}, "source": [ "**[Optional]** You might choose to close the Dask cluster here" @@ -1212,7 +1212,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "5ef2f05e", + "id": "83f6ff78", "metadata": {}, "outputs": [], "source": [ @@ -1222,7 +1222,7 @@ }, { "cell_type": "markdown", - "id": "7a2feadc", + "id": "a0ccaca6", "metadata": {}, "source": [ "## 5. 
Fuzzy Deduplication\n", @@ -1254,7 +1254,7 @@ }, { "cell_type": "markdown", - "id": "ffca14ad", + "id": "7a67ba83", "metadata": {}, "source": [ "**If there is no running Dask cluster, start a GPU Dask cluster here**" @@ -1263,7 +1263,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e00ba2fd", + "id": "45eb269a", "metadata": {}, "outputs": [], "source": [ @@ -1274,7 +1274,7 @@ }, { "cell_type": "markdown", - "id": "5df73743", + "id": "bf4b3f77", "metadata": {}, "source": [ "### 5.1 Minhash\n", @@ -1299,7 +1299,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "1fc5bff3", + "id": "81f8c2a4", "metadata": {}, "outputs": [], "source": [ @@ -1308,7 +1308,7 @@ }, { "cell_type": "markdown", - "id": "7bf9cc8d", + "id": "f584bfeb", "metadata": {}, "source": [ "Define parameters" @@ -1317,7 +1317,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "d600d1b8", + "id": "f8453a1e", "metadata": {}, "outputs": [], "source": [ @@ -1345,7 +1345,7 @@ }, { "cell_type": "markdown", - "id": "1c31ddf4", + "id": "db343fad", "metadata": {}, "source": [ "Run MinHash" @@ -1354,7 +1354,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "88540950", + "id": "0c5c0484", "metadata": {}, "outputs": [ { @@ -1414,7 +1414,7 @@ }, { "cell_type": "markdown", - "id": "158bf3ab", + "id": "53d67a08", "metadata": {}, "source": [ "Verify result" @@ -1423,7 +1423,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "10b5eb55", + "id": "e038468c", "metadata": {}, "outputs": [ { @@ -1502,7 +1502,7 @@ }, { "cell_type": "markdown", - "id": "0bce0f80", + "id": "af9a87c1", "metadata": {}, "source": [ "### 5.2 LSH\n", @@ -1524,7 +1524,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "645b8a53", + "id": "c7636651", "metadata": {}, "outputs": [], "source": [ @@ -1535,7 +1535,7 @@ }, { "cell_type": "markdown", - "id": "110db216", + "id": "a1d7b045", "metadata": {}, "source": [ "Define parameters" @@ -1544,7 +1544,7 @@ { "cell_type": "code", "execution_count": 16, - 
"id": "738ab265", + "id": "80f8ca9f", "metadata": {}, "outputs": [], "source": [ @@ -1569,7 +1569,7 @@ }, { "cell_type": "markdown", - "id": "a5250a2a", + "id": "bf80e665", "metadata": {}, "source": [ "Run LSH" @@ -1578,7 +1578,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "1ef61e2b", + "id": "381728cb", "metadata": {}, "outputs": [ { @@ -1628,7 +1628,7 @@ }, { "cell_type": "markdown", - "id": "ad2e3b60", + "id": "ff2c8bb7", "metadata": {}, "source": [ "Verify result" @@ -1637,7 +1637,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "9d0449c6", + "id": "d91cbee8", "metadata": {}, "outputs": [ { @@ -1722,7 +1722,7 @@ }, { "cell_type": "markdown", - "id": "f952f074", + "id": "d63827e3", "metadata": {}, "source": [ "### 5.3 Jaccard Shuffle\n", @@ -1744,7 +1744,7 @@ { "cell_type": "code", "execution_count": 19, - "id": "707ea54d", + "id": "a8beba77", "metadata": {}, "outputs": [], "source": [ @@ -1757,7 +1757,7 @@ }, { "cell_type": "markdown", - "id": "8f2e321d", + "id": "83122f55", "metadata": {}, "source": [ "Define parameters" @@ -1766,7 +1766,7 @@ { "cell_type": "code", "execution_count": 25, - "id": "70e2dff9", + "id": "0459e58b", "metadata": {}, "outputs": [], "source": [ @@ -1800,7 +1800,7 @@ }, { "cell_type": "markdown", - "id": "d0f19efa", + "id": "bd8e08ba", "metadata": {}, "source": [ "Run Jaccard map bucket" @@ -1809,7 +1809,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "b2850b0a", + "id": "b6e938f2", "metadata": {}, "outputs": [ { @@ -1833,23 +1833,24 @@ " blocksize=text_ddf_blocksize,\n", " id_column=input_id_field,\n", " text_column=input_text_field,\n", + " input_meta=None\n", ")\n", + "\n", "# Read \"_buckets.parquet\"\n", "ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, num_workers=num_workers)\n", "\n", "#Run _MapBuckets()\n", - "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field, logger=jaccard_shuffle_log_path)\n", + "map_buckets = 
_MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field, logger=jaccard_shuffle_log_path, text_field=input_text_field)\n", "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type)\n", "\n", "#Write to disk\n", "ddf_anchor_docs_with_bk.to_parquet(output_anchor_docs_with_bk_path, write_index=False)\n", - "\n", "print(f\"Time taken for Bucket Mapping:{time.time()-t0} s\")" ] }, { "cell_type": "markdown", - "id": "a1533a15", + "id": "554b20b7", "metadata": {}, "source": [ "Verify result" @@ -1858,7 +1859,7 @@ { "cell_type": "code", "execution_count": 27, - "id": "d74012c3", + "id": "7cdb5363", "metadata": {}, "outputs": [ { @@ -1974,7 +1975,7 @@ }, { "cell_type": "markdown", - "id": "1487b1ad", + "id": "9b89bfbb", "metadata": {}, "source": [ "**[Optional]** Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" @@ -1983,7 +1984,7 @@ { "cell_type": "code", "execution_count": 30, - "id": "b414f703", + "id": "3b40f33c", "metadata": {}, "outputs": [], "source": [ @@ -1992,7 +1993,7 @@ }, { "cell_type": "markdown", - "id": "f33a6782", + "id": "019eb2f0", "metadata": {}, "source": [ "Run Jaccard Shuffle" @@ -2001,7 +2002,7 @@ { "cell_type": "code", "execution_count": 31, - "id": "86d1b3e5", + "id": "5fbefc47", "metadata": {}, "outputs": [ { @@ -2071,7 +2072,7 @@ }, { "cell_type": "markdown", - "id": "86b06cb5", + "id": "3651fc00", "metadata": {}, "source": [ "Verify result" @@ -2080,7 +2081,7 @@ { "cell_type": "code", "execution_count": 32, - "id": "1b51a5fb", + "id": "62c0cff5", "metadata": {}, "outputs": [ { @@ -2184,7 +2185,7 @@ }, { "cell_type": "markdown", - "id": "b8644e51", + "id": "d1a2e7d3", "metadata": {}, "source": [ "### 5.4 Jaccard Compute\n", @@ -2200,7 +2201,7 @@ { "cell_type": "code", "execution_count": 33, - "id": "b1a532a2", + "id": "eab94c82", "metadata": {}, "outputs": [], "source": [ @@ -2209,7 +2210,7 @@ }, 
{ "cell_type": "markdown", - "id": "c9e65975", + "id": "685b1661", "metadata": {}, "source": [ "Define parameters" @@ -2218,7 +2219,7 @@ { "cell_type": "code", "execution_count": 34, - "id": "291d3aaa", + "id": "aa33c7c9", "metadata": {}, "outputs": [], "source": [ @@ -2240,7 +2241,7 @@ }, { "cell_type": "markdown", - "id": "9341b58c", + "id": "924b570d", "metadata": {}, "source": [ "Run Jaccard Compute" @@ -2249,7 +2250,7 @@ { "cell_type": "code", "execution_count": 35, - "id": "9b1b9bdd", + "id": "d384e26d", "metadata": {}, "outputs": [ { @@ -2285,7 +2286,7 @@ }, { "cell_type": "markdown", - "id": "bb740d30", + "id": "73b90e38", "metadata": {}, "source": [ "Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." @@ -2294,7 +2295,7 @@ { "cell_type": "code", "execution_count": 36, - "id": "a41d1f09", + "id": "3094049a", "metadata": {}, "outputs": [ { @@ -2379,7 +2380,7 @@ }, { "cell_type": "markdown", - "id": "a505402e", + "id": "436886b7", "metadata": {}, "source": [ "### 5.5 Connected Components\n", @@ -2395,7 +2396,7 @@ { "cell_type": "code", "execution_count": 37, - "id": "3bff521b", + "id": "6abec527", "metadata": {}, "outputs": [], "source": [ @@ -2404,7 +2405,7 @@ }, { "cell_type": "markdown", - "id": "d8afed6a", + "id": "7b6dc1a6", "metadata": {}, "source": [ "Define parameters" @@ -2413,7 +2414,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "b40735dd", + "id": "b1542ec5", "metadata": {}, "outputs": [], "source": [ @@ -2434,7 +2435,7 @@ }, { "cell_type": "markdown", - "id": "33d8957f", + "id": "b6bfcb2e", "metadata": {}, "source": [ "Run Connected Component" @@ -2443,7 +2444,7 @@ { "cell_type": "code", "execution_count": 39, - "id": "fe62dd51", + "id": "e5de8d51", "metadata": {}, "outputs": [ { @@ -2476,7 +2477,7 @@ }, { "cell_type": "markdown", - "id": "669495ee", + "id": "cc8bee9e", "metadata": {}, "source": [ "Verify the result of 
`Connected Components`" @@ -2485,7 +2486,7 @@ { "cell_type": "code", "execution_count": 40, - "id": "efbd6973", + "id": "bce6dd64", "metadata": {}, "outputs": [ { @@ -2570,7 +2571,7 @@ }, { "cell_type": "markdown", - "id": "0c3e2bdc", + "id": "91f8227d", "metadata": {}, "source": [ "Let's check if the output fuzzy duplicated documents within the same group are similar. Please note that the `group` id in your output might be different from the notebook output." @@ -2579,7 +2580,7 @@ { "cell_type": "code", "execution_count": 54, - "id": "d8fa1e8e", + "id": "5545e73b", "metadata": {}, "outputs": [ { @@ -2697,7 +2698,7 @@ }, { "cell_type": "markdown", - "id": "f34b8140", + "id": "c8ab327e", "metadata": {}, "source": [ "Change the `group` number if necessary. By running the code below, we can obtain a list of near duplicated documents." @@ -2706,7 +2707,7 @@ { "cell_type": "code", "execution_count": 55, - "id": "fd01f5fe", + "id": "12d547e5", "metadata": {}, "outputs": [ { @@ -2790,7 +2791,7 @@ }, { "cell_type": "markdown", - "id": "99a8d732", + "id": "70750a50", "metadata": {}, "source": [ "Print the text of near duplicated document. Please replace the `id` if necessary, `id` should be in the format of `_`" @@ -2799,7 +2800,7 @@ { "cell_type": "code", "execution_count": 73, - "id": "68883f58", + "id": "3abaf248", "metadata": {}, "outputs": [ { @@ -2821,7 +2822,7 @@ }, { "cell_type": "markdown", - "id": "3b6578b4", + "id": "ee0b0ed7", "metadata": {}, "source": [ "Below is the English translation of the output above. 
We can see that the two documents are indeed very similar to each other.\n", @@ -2882,7 +2883,7 @@ }, { "cell_type": "markdown", - "id": "f36436f3", + "id": "cdbb5091", "metadata": {}, "source": [ "### 5.6 Fuzzy deduplication wrapper" @@ -2891,7 +2892,7 @@ { "cell_type": "code", "execution_count": 56, - "id": "eb52ec06", + "id": "2727fc33", "metadata": {}, "outputs": [], "source": [ @@ -2901,7 +2902,7 @@ { "cell_type": "code", "execution_count": 57, - "id": "625c1828", + "id": "feeba181", "metadata": {}, "outputs": [], "source": [ @@ -2928,7 +2929,7 @@ }, { "cell_type": "markdown", - "id": "cb76d8e5", + "id": "a6352ba1", "metadata": {}, "source": [ "**[Optional]** If the cache folder is not empty, please CLEAR the folder before proceeding" @@ -2937,7 +2938,7 @@ { "cell_type": "code", "execution_count": 59, - "id": "e7fb4c4c", + "id": "b7c8d95d", "metadata": {}, "outputs": [], "source": [ @@ -2947,7 +2948,7 @@ { "cell_type": "code", "execution_count": 60, - "id": "2368443f", + "id": "db5230ee", "metadata": {}, "outputs": [ { @@ -3052,7 +3053,7 @@ { "cell_type": "code", "execution_count": 61, - "id": "14bfe3bc", + "id": "ea0faaf5", "metadata": {}, "outputs": [ { @@ -3131,7 +3132,7 @@ }, { "cell_type": "markdown", - "id": "d2726cf9", + "id": "9505c5db", "metadata": {}, "source": [ "## 6. Remove duplicates\n", @@ -3141,7 +3142,7 @@ }, { "cell_type": "markdown", - "id": "e4dd78db", + "id": "600c7ae0", "metadata": {}, "source": [ "Define parameters" @@ -3150,7 +3151,7 @@ { "cell_type": "code", "execution_count": 81, - "id": "0027c8d2", + "id": "ee33b4d8", "metadata": {}, "outputs": [], "source": [ @@ -3169,7 +3170,7 @@ }, { "cell_type": "markdown", - "id": "a373860d", + "id": "10bc124d", "metadata": {}, "source": [ "We will first process the result of exact deduplication. Since result of exact deduplication contains original ID used in input dataset, it is more straightforward to deal with." 
@@ -3178,7 +3179,7 @@ { "cell_type": "code", "execution_count": 82, - "id": "f59e92c3", + "id": "6540ba39", "metadata": {}, "outputs": [ { @@ -3208,7 +3209,7 @@ }, { "cell_type": "markdown", - "id": "f55d6737", + "id": "f6eafc45", "metadata": {}, "source": [ "For the result of fuzzy deduplication, we first need to reconstruct the document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" @@ -3216,7 +3217,7 @@ }, { "cell_type": "markdown", - "id": "3b9c122d", + "id": "ba38d672", "metadata": {}, "source": [ "**[Optional]** Uncomment the cell to use the result from step by step fuzzy deduplication" @@ -3225,7 +3226,7 @@ { "cell_type": "code", "execution_count": 83, - "id": "c6a1bb0a", + "id": "70cf2e44", "metadata": {}, "outputs": [], "source": [ @@ -3254,7 +3255,7 @@ { "cell_type": "code", "execution_count": 84, - "id": "746d3673", + "id": "ead56250", "metadata": {}, "outputs": [], "source": [ @@ -3268,7 +3269,7 @@ { "cell_type": "code", "execution_count": 85, - "id": "62b34838", + "id": "6fe61b2c", "metadata": {}, "outputs": [], "source": [ @@ -3281,7 +3282,7 @@ }, { "cell_type": "markdown", - "id": "edfa52ce", + "id": "948ebb69", "metadata": {}, "source": [ "Verify the result of duplicate removal. We can see that the number of documents in the resulting dataset is less than in the original dataset (length = 161748)" @@ -3290,7 +3291,7 @@ { "cell_type": "code", "execution_count": 86, - "id": "78eee9b3", + "id": "596bc3e8", "metadata": {}, "outputs": [ { @@ -3308,7 +3309,7 @@ }, { "cell_type": "markdown", - "id": "15e07a32", + "id": "533a6f45", "metadata": {}, "source": [ "Close the GPU Dask Cluster. You might encounter an error such as `Caught signal 11`. It's OK, just rerun the cell again." 
@@ -3317,7 +3318,7 @@ { "cell_type": "code", "execution_count": 88, - "id": "8e807bd7", + "id": "5a5c25e8", "metadata": {}, "outputs": [], "source": [ @@ -3327,7 +3328,7 @@ }, { "cell_type": "markdown", - "id": "a416a293", + "id": "b2f56151", "metadata": {}, "source": [ "## 7. Heuristic Filtering\n", @@ -3350,7 +3351,7 @@ { "cell_type": "code", "execution_count": 89, - "id": "b988ad1e", + "id": "40c55ee5", "metadata": {}, "outputs": [], "source": [ @@ -3361,7 +3362,7 @@ }, { "cell_type": "markdown", - "id": "097a1b48", + "id": "6dcbc03b", "metadata": {}, "source": [ "**[Optional]** The following cell suppresses warnings from Dask." @@ -3370,7 +3371,7 @@ { "cell_type": "code", "execution_count": 90, - "id": "44552288", + "id": "f40570fd", "metadata": {}, "outputs": [], "source": [ @@ -3382,7 +3383,7 @@ }, { "cell_type": "markdown", - "id": "9a59699d", + "id": "52cfb548", "metadata": {}, "source": [ "Create a CPU Dask Cluster." @@ -3391,7 +3392,7 @@ { "cell_type": "code", "execution_count": 91, - "id": "b8f80ab3", + "id": "c5e9ca43", "metadata": {}, "outputs": [], "source": [ @@ -3401,7 +3402,7 @@ }, { "cell_type": "markdown", - "id": "a7702918", + "id": "9b6d0be0", "metadata": {}, "source": [ "Define some helper functions" @@ -3410,7 +3411,7 @@ { "cell_type": "code", "execution_count": 92, - "id": "6f2e7523", + "id": "eaac3be7", "metadata": {}, "outputs": [], "source": [ @@ -3441,7 +3442,7 @@ }, { "cell_type": "markdown", - "id": "227fa8b0", + "id": "4ad1d01c", "metadata": {}, "source": [ "Define parameters" @@ -3450,7 +3451,7 @@ { "cell_type": "code", "execution_count": 93, - "id": "a894f90f", + "id": "3b9e0612", "metadata": {}, "outputs": [], "source": [ @@ -3480,7 +3481,7 @@ }, { "cell_type": "markdown", - "id": "ccea406e", + "id": "92c76ae9", "metadata": {}, "source": [ "Run heuristic filtering" @@ -3489,7 +3490,7 @@ { "cell_type": "code", "execution_count": 94, - "id": "03b3da27", + "id": "b17d092f", "metadata": {}, "outputs": [ { @@ -3589,7 +3590,7 @@ }, { 
"cell_type": "markdown", - "id": "a53b04e9", + "id": "c9f79aa0", "metadata": {}, "source": [ "Verify the result." @@ -3598,7 +3599,7 @@ { "cell_type": "code", "execution_count": 95, - "id": "07475373", + "id": "af467db2", "metadata": {}, "outputs": [ { @@ -3736,7 +3737,7 @@ }, { "cell_type": "markdown", - "id": "24e8b173", + "id": "3eee4872", "metadata": {}, "source": [ "Close the CPU Dask Cluster" @@ -3745,7 +3746,7 @@ { "cell_type": "code", "execution_count": 96, - "id": "12508f5e", + "id": "431a0829", "metadata": {}, "outputs": [], "source": [ @@ -3756,7 +3757,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83e4aed1", + "id": "239d32ed", "metadata": {}, "outputs": [], "source": []