From ea271b31e6a87c5b9b23491e2b2f937356c97b6a Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Fri, 4 Oct 2024 02:11:30 -0500 Subject: [PATCH] sort duplicate cluster by size Signed-off-by: Yang Yu --- tutorials/pretraining-data-curation/helper.py | 12 + .../red-pajama-v2-curation-tutorial.ipynb | 471 ++++++------------ 2 files changed, 167 insertions(+), 316 deletions(-) diff --git a/tutorials/pretraining-data-curation/helper.py b/tutorials/pretraining-data-curation/helper.py index 2f7af9508..da0e9bdee 100644 --- a/tutorials/pretraining-data-curation/helper.py +++ b/tutorials/pretraining-data-curation/helper.py @@ -52,3 +52,15 @@ def convert_str_id_to_int(df, id_column="id"): df["doc_id"] = dx[1].astype("int64").values df["dataset_id"] = dx[0].hash_values() return df + + +def get_dataframe_complement(original_df, filtered_df): + def partition_complement(part_original_df, partition_info=None): + if not partition_info: + return part_original_df + part_filtered_df = filtered_df.get_partition(partition_info["number"]) + complement_mask = ~part_original_df.index.isin(part_filtered_df.index.persist()) + complement_df = part_original_df[complement_mask] + return complement_df + + return original_df.map_partitions(partition_complement) diff --git a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb index 0d6999254..2d1bfcb68 100644 --- a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb +++ b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb @@ -823,7 +823,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "6532bd31-af79-4d1e-bfb3-5bf432f55ae5", "metadata": { "tags": [] @@ -939,7 +939,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "8777755c-002c-4e37-ad51-ca515ca53b4b", + "id": "9ef1205a-981f-48d1-8e2e-53762e33a0da", "metadata": { "tags": [] }, @@ -972,28 +972,28 @@ " \n", " \n", " 0\n", 
- " rpv2-2023-06-1594900690\n", - " b31e61ba8cb85680f7acea426b9848fe\n", + " rpv2-2023-06-0543500671\n", + " 5bb014b8aca49d2d2a46925b63c09f7f\n", " \n", " \n", " 1\n", - " rpv2-2023-06-1004500292\n", - " 72bb25bef9420164ac8bc86a2ae340ef\n", + " rpv2-2023-06-1721200315\n", + " 0dba141f62e01ffedde20dd6bf28df50\n", " \n", " \n", " 2\n", - " rpv2-2023-06-2727300658\n", - " 0c5834608662294d3dfa64de71850448\n", + " rpv2-2023-06-1989800099\n", + " 1e33a4ffce3154c8275ed09ff8049e1a\n", " \n", " \n", " 3\n", - " rpv2-2023-06-1642700934\n", - " 1a247f38a86b32e0a6162f892c80a198\n", + " rpv2-2023-06-2578700629\n", + " 11608d5ffe62efb623abdcb813f0827a\n", " \n", " \n", " 4\n", - " rpv2-2023-06-0206000016\n", - " 61a74bf725e1ba23c530a1e8fc71d554\n", + " rpv2-2023-06-3538600607\n", + " cb72ac618d7a6e60cf7d012c6be82672\n", " \n", " \n", "\n", @@ -1001,11 +1001,11 @@ ], "text/plain": [ " id _hashes\n", - "0 rpv2-2023-06-1594900690 b31e61ba8cb85680f7acea426b9848fe\n", - "1 rpv2-2023-06-1004500292 72bb25bef9420164ac8bc86a2ae340ef\n", - "2 rpv2-2023-06-2727300658 0c5834608662294d3dfa64de71850448\n", - "3 rpv2-2023-06-1642700934 1a247f38a86b32e0a6162f892c80a198\n", - "4 rpv2-2023-06-0206000016 61a74bf725e1ba23c530a1e8fc71d554" + "0 rpv2-2023-06-0543500671 5bb014b8aca49d2d2a46925b63c09f7f\n", + "1 rpv2-2023-06-1721200315 0dba141f62e01ffedde20dd6bf28df50\n", + "2 rpv2-2023-06-1989800099 1e33a4ffce3154c8275ed09ff8049e1a\n", + "3 rpv2-2023-06-2578700629 11608d5ffe62efb623abdcb813f0827a\n", + "4 rpv2-2023-06-3538600607 cb72ac618d7a6e60cf7d012c6be82672" ] }, "execution_count": 10, @@ -1020,8 +1020,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "1f6d2165-6361-4d5e-9fca-c8de322c6ee1", + "execution_count": 17, + "id": "4ac54290-dc1d-4f60-b768-f162d338ca47", "metadata": { "tags": [] }, @@ -1047,7 +1047,7 @@ " \n", " \n", " \n", - " id\n", + " count\n", " \n", " \n", " _hashes\n", @@ -1056,52 +1056,56 @@ " \n", " \n", " \n", - " 7a724e20912f26144d90dbf74c6fe0ae\n", - " 2\n", + " 
b7ba44a047ca570585d182d28d1e6bf8\n", + " 1819\n", " \n", " \n", - " fab1be64dd1d1a20ec5e3a77b962a3e8\n", - " 2\n", + " 0469bde3868757d92af369c59992b9d9\n", + " 1785\n", " \n", " \n", - " 0d039804d82f3a375e19ca9cbb3d830a\n", - " 2\n", + " bdc1e82cba718a4717c683bf6a5541bd\n", + " 1784\n", " \n", " \n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " 27\n", + " f14149344e6519beaac2590b0535d267\n", + " 1771\n", " \n", " \n", - " bb3f77234cb015a2c24710d22c0bfc57\n", - " 2\n", + " f88eb7064d8e73c081af0731ba73c451\n", + " 1765\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id\n", - "_hashes \n", - "7a724e20912f26144d90dbf74c6fe0ae 2\n", - "fab1be64dd1d1a20ec5e3a77b962a3e8 2\n", - "0d039804d82f3a375e19ca9cbb3d830a 2\n", - "e05b1c37967e7f4eec2392bd6e65b668 27\n", - "bb3f77234cb015a2c24710d22c0bfc57 2" + " count\n", + "_hashes \n", + "b7ba44a047ca570585d182d28d1e6bf8 1819\n", + "0469bde3868757d92af369c59992b9d9 1785\n", + "bdc1e82cba718a4717c683bf6a5541bd 1784\n", + "f14149344e6519beaac2590b0535d267 1771\n", + "f88eb7064d8e73c081af0731ba73c451 1765" ] }, - "execution_count": 11, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "duplicates_df.groupby('_hashes').agg({'id': 'count'}).head()" + "duplicates_df.groupby('_hashes') \\\n", + " .agg({'id': 'count'}) \\\n", + " .rename(columns={'id': 'count'}) \\\n", + " .sort_values('count', ascending=False) \\\n", + " .head()" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "45a965fa-c2c1-4afa-bee8-9eb48945ee3b", + "execution_count": 13, + "id": "8b38f5a6-6f48-4081-a717-fb9a1b5e9539", "metadata": { "tags": [] }, @@ -1133,182 +1137,51 @@ " \n", " \n", " \n", - " 1036\n", - " rpv2-2023-06-2771406540\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 1052\n", - " rpv2-2023-06-2443106203\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 1063\n", - " rpv2-2023-06-0509306409\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 1364\n", - 
" rpv2-2023-06-3063906432\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 1490\n", - " rpv2-2023-06-2753207260\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 1507\n", - " rpv2-2023-06-3001307073\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 1538\n", - " rpv2-2023-06-0719006978\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 1551\n", - " rpv2-2023-06-0700107191\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4793\n", - " rpv2-2023-06-0106826500\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4794\n", - " rpv2-2023-06-0117726560\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4803\n", - " rpv2-2023-06-0308126537\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4850\n", - " rpv2-2023-06-0271426680\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4895\n", - " rpv2-2023-06-2660226720\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4920\n", - " rpv2-2023-06-3275426459\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4961\n", - " rpv2-2023-06-1931826642\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 4998\n", - " rpv2-2023-06-0709426436\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 5052\n", - " rpv2-2023-06-0554826445\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 5068\n", - " rpv2-2023-06-1829326568\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 5458\n", - " rpv2-2023-06-1253629896\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 5466\n", - " rpv2-2023-06-0700430073\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 6891\n", - " rpv2-2023-06-1160137048\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 6900\n", - " rpv2-2023-06-1139736733\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", - " \n", - " \n", - " 6927\n", - " 
rpv2-2023-06-3238236860\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", + " 1\n", + " rpv2-2023-06-0962900660\n", + " b7ba44a047ca570585d182d28d1e6bf8\n", " \n", " \n", - " 6995\n", - " rpv2-2023-06-1456836789\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", + " 5\n", + " rpv2-2023-06-2417100276\n", + " b7ba44a047ca570585d182d28d1e6bf8\n", " \n", " \n", - " 7001\n", - " rpv2-2023-06-1752936920\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", + " 8\n", + " rpv2-2023-06-2936200328\n", + " b7ba44a047ca570585d182d28d1e6bf8\n", " \n", " \n", - " 7049\n", - " rpv2-2023-06-0386936976\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", + " 9\n", + " rpv2-2023-06-1423100927\n", + " b7ba44a047ca570585d182d28d1e6bf8\n", " \n", " \n", - " 7144\n", - " rpv2-2023-06-1540136826\n", - " e05b1c37967e7f4eec2392bd6e65b668\n", + " 16\n", + " rpv2-2023-06-2499600613\n", + " b7ba44a047ca570585d182d28d1e6bf8\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id _hashes\n", - "1036 rpv2-2023-06-2771406540 e05b1c37967e7f4eec2392bd6e65b668\n", - "1052 rpv2-2023-06-2443106203 e05b1c37967e7f4eec2392bd6e65b668\n", - "1063 rpv2-2023-06-0509306409 e05b1c37967e7f4eec2392bd6e65b668\n", - "1364 rpv2-2023-06-3063906432 e05b1c37967e7f4eec2392bd6e65b668\n", - "1490 rpv2-2023-06-2753207260 e05b1c37967e7f4eec2392bd6e65b668\n", - "1507 rpv2-2023-06-3001307073 e05b1c37967e7f4eec2392bd6e65b668\n", - "1538 rpv2-2023-06-0719006978 e05b1c37967e7f4eec2392bd6e65b668\n", - "1551 rpv2-2023-06-0700107191 e05b1c37967e7f4eec2392bd6e65b668\n", - "4793 rpv2-2023-06-0106826500 e05b1c37967e7f4eec2392bd6e65b668\n", - "4794 rpv2-2023-06-0117726560 e05b1c37967e7f4eec2392bd6e65b668\n", - "4803 rpv2-2023-06-0308126537 e05b1c37967e7f4eec2392bd6e65b668\n", - "4850 rpv2-2023-06-0271426680 e05b1c37967e7f4eec2392bd6e65b668\n", - "4895 rpv2-2023-06-2660226720 e05b1c37967e7f4eec2392bd6e65b668\n", - "4920 rpv2-2023-06-3275426459 e05b1c37967e7f4eec2392bd6e65b668\n", - "4961 rpv2-2023-06-1931826642 e05b1c37967e7f4eec2392bd6e65b668\n", - "4998 
rpv2-2023-06-0709426436 e05b1c37967e7f4eec2392bd6e65b668\n", - "5052 rpv2-2023-06-0554826445 e05b1c37967e7f4eec2392bd6e65b668\n", - "5068 rpv2-2023-06-1829326568 e05b1c37967e7f4eec2392bd6e65b668\n", - "5458 rpv2-2023-06-1253629896 e05b1c37967e7f4eec2392bd6e65b668\n", - "5466 rpv2-2023-06-0700430073 e05b1c37967e7f4eec2392bd6e65b668\n", - "6891 rpv2-2023-06-1160137048 e05b1c37967e7f4eec2392bd6e65b668\n", - "6900 rpv2-2023-06-1139736733 e05b1c37967e7f4eec2392bd6e65b668\n", - "6927 rpv2-2023-06-3238236860 e05b1c37967e7f4eec2392bd6e65b668\n", - "6995 rpv2-2023-06-1456836789 e05b1c37967e7f4eec2392bd6e65b668\n", - "7001 rpv2-2023-06-1752936920 e05b1c37967e7f4eec2392bd6e65b668\n", - "7049 rpv2-2023-06-0386936976 e05b1c37967e7f4eec2392bd6e65b668\n", - "7144 rpv2-2023-06-1540136826 e05b1c37967e7f4eec2392bd6e65b668" + " id _hashes\n", + "1 rpv2-2023-06-0962900660 b7ba44a047ca570585d182d28d1e6bf8\n", + "5 rpv2-2023-06-2417100276 b7ba44a047ca570585d182d28d1e6bf8\n", + "8 rpv2-2023-06-2936200328 b7ba44a047ca570585d182d28d1e6bf8\n", + "9 rpv2-2023-06-1423100927 b7ba44a047ca570585d182d28d1e6bf8\n", + "16 rpv2-2023-06-2499600613 b7ba44a047ca570585d182d28d1e6bf8" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "duplicates_df[duplicates_df['_hashes'] == 'e05b1c37967e7f4eec2392bd6e65b668'].compute()" + "dup_group = duplicates_df[duplicates_df['_hashes'] == 'b7ba44a047ca570585d182d28d1e6bf8'].compute()\n", + "dup_group.head()" ] }, { @@ -1316,13 +1189,13 @@ "id": "5cc83333-b19b-4335-92ef-6bcc29f3d7bf", "metadata": {}, "source": [ - "Let's verify if the documents with the same hash are exactly the same:" + "[Optional] Verify if the documents with the same hash are exactly the same. 
We can use the ids from the cell output above (ids may change so revise the `dup_ids` as needed):" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "ccb9d8a6-f9ca-47a3-9736-7476b6faf86a", + "execution_count": 16, + "id": "ab1a6018-dead-4d22-b496-87b5afe56e7a", "metadata": { "tags": [] }, @@ -1331,63 +1204,41 @@ "name": "stdout", "output_type": "stream", "text": [ - "Searching one duplicate took:661.8512754440308\n" + "Searching for example duplicates with specific IDs took 631.4109137058258 seconds\n" ] } ], "source": [ "t0 = time.time()\n", - "dup_ex1 = input_dataset.df[input_dataset.df['id'] == 'rpv2-2023-06-2771406540'].compute()\n", - "print(f\"Searching one duplicate took:{time.time()-t0}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e19b57ba-faf3-4a7e-9e1a-4baaf542e206", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dup_ex1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41f178ca-5f6a-441a-ad97-a51cfd13b921", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(dup_ex1.raw_content.iloc[0])" + "dup_ids = ['rpv2-2023-06-0962900660', 'rpv2-2023-06-2417100276', 'rpv2-2023-06-2936200328'] \n", + "dup_examples = input_dataset.df[input_dataset.df['id'].isin(dup_ids)].compute()\n", + "print(f\"Searching for example duplicates with specific IDs took {time.time()-t0} seconds\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "5a967ae1-dc8b-4abc-9298-aef75be46cb4", + "id": "a62c96e2-cb2e-40ac-9f94-5aedb32e91c0", "metadata": { "tags": [] }, "outputs": [], "source": [ - "dup_ex2 = input_dataset.df[input_dataset.df['id'] == 'rpv2-2023-06-2443106203'].compute()\n", - "print(dup_ex2.raw_content.iloc[0])" + "dup_examples" ] }, { "cell_type": "code", "execution_count": null, - "id": "88b3a6eb-2f6d-491c-9fd3-d2a0e12394c8", + "id": "9876a2e1-ba4e-43a9-9cfe-5035c6e98ab2", "metadata": { "tags": [] }, "outputs": [], "source": [ - "dup_ex2" + "print('Example 
duplicate 1\\n' + dup_examples.raw_content.iloc[0])\n", + "print('\\n\\nExample duplicate 2\\n' + dup_examples.raw_content.iloc[1])\n", + "print('\\n\\nExample duplicate 3\\n' + dup_examples.raw_content.iloc[2])" ] }, { @@ -1538,7 +1389,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "750b1c02-2b37-474f-aaa2-2de86ac3a9e7", "metadata": { "tags": [] @@ -2906,17 +2757,15 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "5a1d5697-7504-4e2c-9808-169eccdcd3af", - "metadata": { - "tags": [] - }, + "execution_count": 5, + "id": "94e8126d-af15-4182-98cd-10df06e9778e", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "docs_to_remove 239037733\n" + "num of docs to remove = 239037733\n" ] } ], @@ -2946,7 +2795,7 @@ "_ = wait(docs_to_remove)\n", "del _ \n", "\n", - "print(\"docs_to_remove\", len(docs_to_remove))" + "print(\"num of docs to remove =\", len(docs_to_remove))" ] }, { @@ -2954,13 +2803,13 @@ "id": "568ee0b5-f2dd-4d34-917f-56f4211a36fe", "metadata": {}, "source": [ - "We can examine some example duplicates." + "We can examine the size of the duplicate clusters. The largest cluster has 775,379 near duplicates." 
] }, { "cell_type": "code", - "execution_count": 4, - "id": "36c80cc4-44ee-43cc-bdd3-89e456c23469", + "execution_count": 7, + "id": "cae7f166-836a-4c21-bff2-7453254956b7", "metadata": { "tags": [] }, @@ -2986,7 +2835,7 @@ " \n", " \n", " \n", - " id\n", + " count\n", " \n", " \n", " group\n", @@ -2995,46 +2844,46 @@ " \n", " \n", " \n", - " 123501402\n", - " 27\n", + " 350652173\n", + " 775379\n", " \n", " \n", - " 83259859\n", - " 2\n", + " 93521324\n", + " 493227\n", " \n", " \n", - " 266079136\n", - " 3\n", + " 24\n", + " 112861\n", " \n", " \n", - " 119886209\n", - " 6888\n", + " 319292355\n", + " 96224\n", " \n", " \n", - " 221343674\n", - " 21\n", + " 70141069\n", + " 67474\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id\n", - "group \n", - "123501402 27\n", - "83259859 2\n", - "266079136 3\n", - "119886209 6888\n", - "221343674 21" + " count\n", + "group \n", + "350652173 775379\n", + "93521324 493227\n", + "24 112861\n", + "319292355 96224\n", + "70141069 67474" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cc_grouped = cc_result.groupby('group').agg({'id': 'count'})\n", + "cc_grouped = cc_result.groupby('group').agg({'doc_id': 'count'}).rename(columns={'doc_id': 'count'}).sort_values('count', ascending=False).compute()\n", "cc_grouped.head()" ] }, @@ -3043,13 +2892,13 @@ "id": "0def7323-3d2c-4861-9b7e-a1e296ccf329", "metadata": {}, "source": [ - "For example, let's look into group \"119886209\"." + "[Optional] Verify if fuzzy duplicates are similar. For example, we can look into the largest group \"350652173\"." 
] }, { "cell_type": "code", - "execution_count": 5, - "id": "2fa0cdab-cd5d-445b-bd9a-af588cf2df14", + "execution_count": 8, + "id": "e22cb491-c2ab-4ec4-8313-ae2bcd66a352", "metadata": { "tags": [] }, @@ -3077,66 +2926,60 @@ " \n", " dataset_id\n", " doc_id\n", + " \n", + " \n", " group\n", - " id\n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 3\n", + " 350652173\n", " 256213913\n", - " 469622202\n", - " 119886209\n", - " rpv2-2023-06-0469622202\n", + " 1285625132\n", " \n", " \n", - " 37437\n", + " 350652173\n", " 256213913\n", - " 501608788\n", - " 119886209\n", - " rpv2-2023-06-0501608788\n", + " 2033200488\n", " \n", " \n", - " 60404\n", + " 350652173\n", " 256213913\n", - " 2341629062\n", - " 119886209\n", - " rpv2-2023-06-2341629062\n", + " 428016172\n", " \n", " \n", - " 81405\n", + " 350652173\n", " 256213913\n", - " 1511229746\n", - " 119886209\n", - " rpv2-2023-06-1511229746\n", + " 1268721963\n", " \n", " \n", - " 148765\n", + " 350652173\n", " 256213913\n", - " 2369426855\n", - " 119886209\n", - " rpv2-2023-06-2369426855\n", + " 1285428574\n", " \n", " \n", "\n", "" ], "text/plain": [ - " dataset_id doc_id group id\n", - "3 256213913 469622202 119886209 rpv2-2023-06-0469622202\n", - "37437 256213913 501608788 119886209 rpv2-2023-06-0501608788\n", - "60404 256213913 2341629062 119886209 rpv2-2023-06-2341629062\n", - "81405 256213913 1511229746 119886209 rpv2-2023-06-1511229746\n", - "148765 256213913 2369426855 119886209 rpv2-2023-06-2369426855" + " dataset_id doc_id\n", + "group \n", + "350652173 256213913 1285625132\n", + "350652173 256213913 2033200488\n", + "350652173 256213913 428016172\n", + "350652173 256213913 1268721963\n", + "350652173 256213913 1285428574" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "dup_group = cc_result[cc_result['group'] == 119886209].compute()\n", + "dup_group = cc_result.loc[350652173].compute()\n", "dup_group.head()" ] }, @@ -3145,7 +2988,7 
@@ "id": "170c1cf4-8cb9-4f10-aab3-acfdaa9e5b16", "metadata": {}, "source": [ - "We can examine the first five documents in this component:" + "We will examine the first five documents in this cluster:" ] }, { @@ -3175,13 +3018,13 @@ "id": "9772bf71-9e18-4e59-b9f8-ebd9053c79b0", "metadata": {}, "source": [ - "Let's visualize the content of these documents and see if they are similar." + "Let's visualize the content of these documents and see if they are similar (ids may change so revise the `dup_ids` as needed)." ] }, { "cell_type": "code", - "execution_count": 8, - "id": "ec6ad2da-c45a-4b66-9932-60b0d4978f61", + "execution_count": 10, + "id": "e3cc167f-30f8-470d-99e3-0a2d916d46bf", "metadata": { "tags": [] }, @@ -3190,15 +3033,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Finding near duplicates with specific IDs took 882.7411408424377 seconds\n" + "Searching for near duplicate examples with specific IDs took 610.5046670436859 seconds\n" ] } ], "source": [ "t0 = time.time()\n", - "dup_ids = ['rpv2-2023-06-0469622202', 'rpv2-2023-06-0501608788', 'rpv2-2023-06-2341629062','rpv2-2023-06-1511229746','rpv2-2023-06-2369426855'] \n", + "dup_ids = [\n", + " 'rpv2-2023-06-1285625132',\n", + " 'rpv2-2023-06-2033200488',\n", + " 'rpv2-2023-06-0428016172',\n", + " 'rpv2-2023-06-1268721963',\n", + " 'rpv2-2023-06-1285428574'\n", + "] \n", "dup_examples = input_dataset.df[input_dataset.df['id'].isin(dup_ids)].compute()\n", - "print(f\"Finding near duplicates with specific IDs took {time.time()-t0} seconds\")" + "print(f\"Searching for near duplicate examples with specific IDs took {time.time()-t0} seconds\")" ] }, { @@ -3224,8 +3073,9 @@ "source": [ "print('Example duplicate 1\\n' + dup_examples.raw_content.iloc[0])\n", "print('\\n\\nExample duplicate 2\\n' + dup_examples.raw_content.iloc[1])\n", - "print('\\n\\nExample duplicate 3\\n' + dup_examples.raw_content.iloc[1])\n", - "print('\\n\\nExample duplicate 4\\n' + dup_examples.raw_content.iloc[1])" + 
"print('\\n\\nExample duplicate 3\\n' + dup_examples.raw_content.iloc[2])\n", + "print('\\n\\nExample duplicate 4\\n' + dup_examples.raw_content.iloc[3])\n", + "print('\\n\\nExample duplicate 5\\n' + dup_examples.raw_content.iloc[4])" ] }, { @@ -4544,8 +4394,6 @@ }, "outputs": [], "source": [ - "\n", - "\n", "cache_dir = expand_outdir_and_mkdir(\n", " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06-and-14/cc-cache\")\n", ")\n", @@ -4948,7 +4796,7 @@ "id": "1c92ae5e-7397-4ad9-9dec-cb93eefc3dde", "metadata": {}, "source": [ - "We can also examine some example low quality documents:" + "[Optional] Examine example low quality documents:" ] }, { @@ -4960,16 +4808,7 @@ }, "outputs": [], "source": [ - "def get_dataframe_complement(original_df, filtered_df):\n", - " def partition_complement(part_original_df, partition_info=None):\n", - " if not partition_info:\n", - " return part_original_df\n", - " part_filtered_df = filtered_df.get_partition(partition_info[\"number\"])\n", - " complement_mask = ~part_original_df.index.isin(part_filtered_df.index.persist())\n", - " complement_df = part_original_df[complement_mask]\n", - " return complement_df\n", - "\n", - " return original_df.map_partitions(partition_complement)\n", + "from helper import get_dataframe_complement\n", "\n", "original_df = dd.read_parquet(hf_input_data_dir)\n", "filtered_df = dd.read_parquet(kept_document_dir)\n",