diff --git a/tutorials/pretraining-data-curation/helper.py b/tutorials/pretraining-data-curation/helper.py
index 2f7af950..da0e9bde 100644
--- a/tutorials/pretraining-data-curation/helper.py
+++ b/tutorials/pretraining-data-curation/helper.py
@@ -52,3 +52,15 @@ def convert_str_id_to_int(df, id_column="id"):
df["doc_id"] = dx[1].astype("int64").values
df["dataset_id"] = dx[0].hash_values()
return df
+
+
+def get_dataframe_complement(original_df, filtered_df):
+    """Return the rows of ``original_df`` whose index is absent from ``filtered_df``.
+
+    The complement is computed partition by partition: partition ``i`` of
+    ``original_df`` is compared only against partition ``i`` of
+    ``filtered_df``. Index labels that appear in a different partition of
+    ``filtered_df`` are not taken into account.
+
+    Args:
+        original_df: Partitioned (Dask-style) DataFrame to take rows from.
+        filtered_df: Partitioned DataFrame holding the rows to exclude.
+            Presumably a row-wise filtered view of ``original_df`` with the
+            same number of partitions -- TODO confirm against the callers.
+
+    Returns:
+        A lazy partitioned DataFrame containing, for every partition of
+        ``original_df``, the rows whose index labels do not occur in the
+        matching partition of ``filtered_df``.
+    """
+
+    def partition_complement(part_original_df, part_filtered_df):
+        # Keep only the rows whose index label is missing from the filtered
+        # partition.
+        complement_mask = ~part_original_df.index.isin(part_filtered_df.index)
+        return part_original_df[complement_mask]
+
+    # Hand filtered_df to map_partitions as an argument so that every task
+    # receives its partner partition already materialised, instead of calling
+    # get_partition()/persist() on the whole collection from inside each task.
+    # align_dataframes=False keeps the pairing positional (by partition
+    # number) rather than re-aligning the two frames on their divisions.
+    return original_df.map_partitions(
+        partition_complement,
+        filtered_df,
+        align_dataframes=False,
+    )
diff --git a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
index 0d699925..2d1bfcb6 100644
--- a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
+++ b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
@@ -823,7 +823,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"id": "6532bd31-af79-4d1e-bfb3-5bf432f55ae5",
"metadata": {
"tags": []
@@ -939,7 +939,7 @@
{
"cell_type": "code",
"execution_count": 10,
- "id": "8777755c-002c-4e37-ad51-ca515ca53b4b",
+ "id": "9ef1205a-981f-48d1-8e2e-53762e33a0da",
"metadata": {
"tags": []
},
@@ -972,28 +972,28 @@
"
\n",
" \n",
" 0 | \n",
- " rpv2-2023-06-1594900690 | \n",
- " b31e61ba8cb85680f7acea426b9848fe | \n",
+ " rpv2-2023-06-0543500671 | \n",
+ " 5bb014b8aca49d2d2a46925b63c09f7f | \n",
"
\n",
" \n",
" 1 | \n",
- " rpv2-2023-06-1004500292 | \n",
- " 72bb25bef9420164ac8bc86a2ae340ef | \n",
+ " rpv2-2023-06-1721200315 | \n",
+ " 0dba141f62e01ffedde20dd6bf28df50 | \n",
"
\n",
" \n",
" 2 | \n",
- " rpv2-2023-06-2727300658 | \n",
- " 0c5834608662294d3dfa64de71850448 | \n",
+ " rpv2-2023-06-1989800099 | \n",
+ " 1e33a4ffce3154c8275ed09ff8049e1a | \n",
"
\n",
" \n",
" 3 | \n",
- " rpv2-2023-06-1642700934 | \n",
- " 1a247f38a86b32e0a6162f892c80a198 | \n",
+ " rpv2-2023-06-2578700629 | \n",
+ " 11608d5ffe62efb623abdcb813f0827a | \n",
"
\n",
" \n",
" 4 | \n",
- " rpv2-2023-06-0206000016 | \n",
- " 61a74bf725e1ba23c530a1e8fc71d554 | \n",
+ " rpv2-2023-06-3538600607 | \n",
+ " cb72ac618d7a6e60cf7d012c6be82672 | \n",
"
\n",
" \n",
"\n",
@@ -1001,11 +1001,11 @@
],
"text/plain": [
" id _hashes\n",
- "0 rpv2-2023-06-1594900690 b31e61ba8cb85680f7acea426b9848fe\n",
- "1 rpv2-2023-06-1004500292 72bb25bef9420164ac8bc86a2ae340ef\n",
- "2 rpv2-2023-06-2727300658 0c5834608662294d3dfa64de71850448\n",
- "3 rpv2-2023-06-1642700934 1a247f38a86b32e0a6162f892c80a198\n",
- "4 rpv2-2023-06-0206000016 61a74bf725e1ba23c530a1e8fc71d554"
+ "0 rpv2-2023-06-0543500671 5bb014b8aca49d2d2a46925b63c09f7f\n",
+ "1 rpv2-2023-06-1721200315 0dba141f62e01ffedde20dd6bf28df50\n",
+ "2 rpv2-2023-06-1989800099 1e33a4ffce3154c8275ed09ff8049e1a\n",
+ "3 rpv2-2023-06-2578700629 11608d5ffe62efb623abdcb813f0827a\n",
+ "4 rpv2-2023-06-3538600607 cb72ac618d7a6e60cf7d012c6be82672"
]
},
"execution_count": 10,
@@ -1020,8 +1020,8 @@
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "1f6d2165-6361-4d5e-9fca-c8de322c6ee1",
+ "execution_count": 17,
+ "id": "4ac54290-dc1d-4f60-b768-f162d338ca47",
"metadata": {
"tags": []
},
@@ -1047,7 +1047,7 @@
" \n",
" \n",
- " 1036 | \n",
- " rpv2-2023-06-2771406540 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 1052 | \n",
- " rpv2-2023-06-2443106203 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 1063 | \n",
- " rpv2-2023-06-0509306409 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 1364 | \n",
- " rpv2-2023-06-3063906432 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 1490 | \n",
- " rpv2-2023-06-2753207260 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 1507 | \n",
- " rpv2-2023-06-3001307073 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 1538 | \n",
- " rpv2-2023-06-0719006978 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 1551 | \n",
- " rpv2-2023-06-0700107191 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4793 | \n",
- " rpv2-2023-06-0106826500 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4794 | \n",
- " rpv2-2023-06-0117726560 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4803 | \n",
- " rpv2-2023-06-0308126537 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4850 | \n",
- " rpv2-2023-06-0271426680 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4895 | \n",
- " rpv2-2023-06-2660226720 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4920 | \n",
- " rpv2-2023-06-3275426459 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4961 | \n",
- " rpv2-2023-06-1931826642 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 4998 | \n",
- " rpv2-2023-06-0709426436 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 5052 | \n",
- " rpv2-2023-06-0554826445 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 5068 | \n",
- " rpv2-2023-06-1829326568 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 5458 | \n",
- " rpv2-2023-06-1253629896 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 5466 | \n",
- " rpv2-2023-06-0700430073 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 6891 | \n",
- " rpv2-2023-06-1160137048 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 6900 | \n",
- " rpv2-2023-06-1139736733 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
- "
\n",
- " \n",
- " 6927 | \n",
- " rpv2-2023-06-3238236860 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
+ " 1 | \n",
+ " rpv2-2023-06-0962900660 | \n",
+ " b7ba44a047ca570585d182d28d1e6bf8 | \n",
"
\n",
" \n",
- " 6995 | \n",
- " rpv2-2023-06-1456836789 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
+ " 5 | \n",
+ " rpv2-2023-06-2417100276 | \n",
+ " b7ba44a047ca570585d182d28d1e6bf8 | \n",
"
\n",
" \n",
- " 7001 | \n",
- " rpv2-2023-06-1752936920 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
+ " 8 | \n",
+ " rpv2-2023-06-2936200328 | \n",
+ " b7ba44a047ca570585d182d28d1e6bf8 | \n",
"
\n",
" \n",
- " 7049 | \n",
- " rpv2-2023-06-0386936976 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
+ " 9 | \n",
+ " rpv2-2023-06-1423100927 | \n",
+ " b7ba44a047ca570585d182d28d1e6bf8 | \n",
"
\n",
" \n",
- " 7144 | \n",
- " rpv2-2023-06-1540136826 | \n",
- " e05b1c37967e7f4eec2392bd6e65b668 | \n",
+ " 16 | \n",
+ " rpv2-2023-06-2499600613 | \n",
+ " b7ba44a047ca570585d182d28d1e6bf8 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " id _hashes\n",
- "1036 rpv2-2023-06-2771406540 e05b1c37967e7f4eec2392bd6e65b668\n",
- "1052 rpv2-2023-06-2443106203 e05b1c37967e7f4eec2392bd6e65b668\n",
- "1063 rpv2-2023-06-0509306409 e05b1c37967e7f4eec2392bd6e65b668\n",
- "1364 rpv2-2023-06-3063906432 e05b1c37967e7f4eec2392bd6e65b668\n",
- "1490 rpv2-2023-06-2753207260 e05b1c37967e7f4eec2392bd6e65b668\n",
- "1507 rpv2-2023-06-3001307073 e05b1c37967e7f4eec2392bd6e65b668\n",
- "1538 rpv2-2023-06-0719006978 e05b1c37967e7f4eec2392bd6e65b668\n",
- "1551 rpv2-2023-06-0700107191 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4793 rpv2-2023-06-0106826500 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4794 rpv2-2023-06-0117726560 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4803 rpv2-2023-06-0308126537 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4850 rpv2-2023-06-0271426680 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4895 rpv2-2023-06-2660226720 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4920 rpv2-2023-06-3275426459 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4961 rpv2-2023-06-1931826642 e05b1c37967e7f4eec2392bd6e65b668\n",
- "4998 rpv2-2023-06-0709426436 e05b1c37967e7f4eec2392bd6e65b668\n",
- "5052 rpv2-2023-06-0554826445 e05b1c37967e7f4eec2392bd6e65b668\n",
- "5068 rpv2-2023-06-1829326568 e05b1c37967e7f4eec2392bd6e65b668\n",
- "5458 rpv2-2023-06-1253629896 e05b1c37967e7f4eec2392bd6e65b668\n",
- "5466 rpv2-2023-06-0700430073 e05b1c37967e7f4eec2392bd6e65b668\n",
- "6891 rpv2-2023-06-1160137048 e05b1c37967e7f4eec2392bd6e65b668\n",
- "6900 rpv2-2023-06-1139736733 e05b1c37967e7f4eec2392bd6e65b668\n",
- "6927 rpv2-2023-06-3238236860 e05b1c37967e7f4eec2392bd6e65b668\n",
- "6995 rpv2-2023-06-1456836789 e05b1c37967e7f4eec2392bd6e65b668\n",
- "7001 rpv2-2023-06-1752936920 e05b1c37967e7f4eec2392bd6e65b668\n",
- "7049 rpv2-2023-06-0386936976 e05b1c37967e7f4eec2392bd6e65b668\n",
- "7144 rpv2-2023-06-1540136826 e05b1c37967e7f4eec2392bd6e65b668"
+ " id _hashes\n",
+ "1 rpv2-2023-06-0962900660 b7ba44a047ca570585d182d28d1e6bf8\n",
+ "5 rpv2-2023-06-2417100276 b7ba44a047ca570585d182d28d1e6bf8\n",
+ "8 rpv2-2023-06-2936200328 b7ba44a047ca570585d182d28d1e6bf8\n",
+ "9 rpv2-2023-06-1423100927 b7ba44a047ca570585d182d28d1e6bf8\n",
+ "16 rpv2-2023-06-2499600613 b7ba44a047ca570585d182d28d1e6bf8"
]
},
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "duplicates_df[duplicates_df['_hashes'] == 'e05b1c37967e7f4eec2392bd6e65b668'].compute()"
+ "dup_group = duplicates_df[duplicates_df['_hashes'] == 'b7ba44a047ca570585d182d28d1e6bf8'].compute()\n",
+ "dup_group.head()"
]
},
{
@@ -1316,13 +1189,13 @@
"id": "5cc83333-b19b-4335-92ef-6bcc29f3d7bf",
"metadata": {},
"source": [
- "Let's verify if the documents with the same hash are exactly the same:"
+ "[Optional] Verify that the documents sharing the same hash are exactly the same. We can use the IDs from the cell output above (the IDs may change between runs, so update `dup_ids` as needed):"
]
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "ccb9d8a6-f9ca-47a3-9736-7476b6faf86a",
+ "execution_count": 16,
+ "id": "ab1a6018-dead-4d22-b496-87b5afe56e7a",
"metadata": {
"tags": []
},
@@ -1331,63 +1204,41 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Searching one duplicate took:661.8512754440308\n"
+ "Searching for example duplicates with specific IDs took 631.4109137058258 seconds\n"
]
}
],
"source": [
"t0 = time.time()\n",
- "dup_ex1 = input_dataset.df[input_dataset.df['id'] == 'rpv2-2023-06-2771406540'].compute()\n",
- "print(f\"Searching one duplicate took:{time.time()-t0}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e19b57ba-faf3-4a7e-9e1a-4baaf542e206",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "dup_ex1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "41f178ca-5f6a-441a-ad97-a51cfd13b921",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "print(dup_ex1.raw_content.iloc[0])"
+ "dup_ids = ['rpv2-2023-06-0962900660', 'rpv2-2023-06-2417100276', 'rpv2-2023-06-2936200328'] \n",
+ "dup_examples = input_dataset.df[input_dataset.df['id'].isin(dup_ids)].compute()\n",
+ "print(f\"Searching for example duplicates with specific IDs took {time.time()-t0} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "5a967ae1-dc8b-4abc-9298-aef75be46cb4",
+ "id": "a62c96e2-cb2e-40ac-9f94-5aedb32e91c0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
- "dup_ex2 = input_dataset.df[input_dataset.df['id'] == 'rpv2-2023-06-2443106203'].compute()\n",
- "print(dup_ex2.raw_content.iloc[0])"
+ "dup_examples"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "88b3a6eb-2f6d-491c-9fd3-d2a0e12394c8",
+ "id": "9876a2e1-ba4e-43a9-9cfe-5035c6e98ab2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
- "dup_ex2"
+ "print('Example duplicate 1\\n' + dup_examples.raw_content.iloc[0])\n",
+ "print('\\n\\nExample duplicate 2\\n' + dup_examples.raw_content.iloc[1])\n",
+ "print('\\n\\nExample duplicate 3\\n' + dup_examples.raw_content.iloc[2])"
]
},
{
@@ -1538,7 +1389,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"id": "750b1c02-2b37-474f-aaa2-2de86ac3a9e7",
"metadata": {
"tags": []
@@ -2906,17 +2757,15 @@
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "5a1d5697-7504-4e2c-9808-169eccdcd3af",
- "metadata": {
- "tags": []
- },
+ "execution_count": 5,
+ "id": "94e8126d-af15-4182-98cd-10df06e9778e",
+ "metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "docs_to_remove 239037733\n"
+ "num of docs to remove = 239037733\n"
]
}
],
@@ -2946,7 +2795,7 @@
"_ = wait(docs_to_remove)\n",
"del _ \n",
"\n",
- "print(\"docs_to_remove\", len(docs_to_remove))"
+ "print(\"num of docs to remove =\", len(docs_to_remove))"
]
},
{
@@ -2954,13 +2803,13 @@
"id": "568ee0b5-f2dd-4d34-917f-56f4211a36fe",
"metadata": {},
"source": [
- "We can examine some example duplicates."
+ "We can examine the size of the duplicate clusters. The largest cluster has 775,379 near duplicates."
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "36c80cc4-44ee-43cc-bdd3-89e456c23469",
+ "execution_count": 7,
+ "id": "cae7f166-836a-4c21-bff2-7453254956b7",
"metadata": {
"tags": []
},
@@ -2986,7 +2835,7 @@
"