Merge pull request #263 from Living-with-machines/260-change-hf-model-paths

Change the HF base and NER models, and the pipeline's default NER model
mcollardanuy committed Jul 28, 2023
2 parents 7fae8f9 + 8b66651 commit 830dfe8
Showing 5 changed files with 54 additions and 93 deletions.
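Net effect of the commit: instead of fine-tuning a local NER model on top of khosseini/bert_1760_1900, the example notebooks, the pipeline default, and the tests now pull a ready-made toponym NER model from the Hugging Face Hub. A minimal sketch of the new-style instantiation, assuming the import path from the repository layout (only the two keyword arguments are taken from this diff):

    from geoparser import recogniser  # import path assumed from the repo layout

    # New default: load a pretrained toponym NER model straight from the
    # Hugging Face Hub instead of fine-tuning a local "blb_lwm-ner-fine" model.
    myner = recogniser.Recogniser(
        model="Livingwithmachines/toponym-19thC-en",
        load_from_hub=True,
    )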
21 changes: 4 additions & 17 deletions examples/load_use_ner_model.ipynb
@@ -32,7 +32,7 @@
"source": [
"Create a `myner` object of the `Recogniser` class.\n",
"\n",
"We only need to pass the path to the model in `model` and set `load_from_hum` to True, as follows:"
"We only need to pass the path to the model in `model` and set `load_from_hub` to True, as follows:"
]
},
{
@@ -42,21 +42,8 @@
"outputs": [],
"source": [
"myner = recogniser.Recogniser(\n",
" model=\"blb_lwm-ner-fine\",\n",
" pipe=None,\n",
" base_model=\"khosseini/bert_1760_1900\",\n",
" train_dataset=\"../experiments/outputs/data/lwm/ner_fine_train.json\",\n",
" test_dataset=\"../experiments/outputs/data/lwm/ner_fine_dev.json\",\n",
" model_path=\"../resources/models/\",\n",
" training_args={\n",
" \"learning_rate\": 5e-5,\n",
" \"batch_size\": 16,\n",
" \"num_train_epochs\": 4,\n",
" \"weight_decay\": 0.01,\n",
" },\n",
" overwrite_training=False,\n",
" do_test=False,\n",
" load_from_hub=False,\n",
" model=\"Livingwithmachines/toponym-19thC-en\",\n",
" load_from_hub=True,\n",
")"
]
},
@@ -128,7 +115,7 @@
"sentence = \"A remarkable case of rattening has just occurred in the building trade at Sheffield.\"\n",
"\n",
"predictions = myner.ner_predict(sentence)\n",
"print(predictions) # Note that, if you've trained the model in the test mode, the model will probably not identify \"Sheffield\" as a location."
"print(predictions)"
]
}
],
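For context, the remainder of the notebook (unchanged here) builds the pipeline and runs a prediction. A sketch using only calls visible elsewhere in this diff (create_pipeline from tests/test_ner.py, ner_predict from the cell above); storing the pipeline on .pipe is an assumption:

    # Build the NER pipeline for the hub-loaded model, then tag a sample sentence.
    myner.pipe = myner.create_pipeline()  # attribute name assumed; the tests use a local variable

    sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield."
    predictions = myner.ner_predict(sentence)
    print(predictions)  # "Sheffield" should now be tagged as a location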
14 changes: 7 additions & 7 deletions examples/train_use_ner_model.ipynb
@@ -50,21 +50,21 @@
" train_dataset=\"../experiments/outputs/data/lwm/ner_fine_train.json\", # Path to the json file containing the training set (see note above).\n",
" test_dataset=\"../experiments/outputs/data/lwm/ner_fine_dev.json\", # Path to the json file containing the test set (see note above).\n",
" pipe=None, # We'll store the NER pipeline here, leave this empty.\n",
" base_model=\"khosseini/bert_1760_1900\", # Base model to fine-tune for NER. The value can be: either \n",
" base_model=\"Livingwithmachines/bert_1760_1900\", # Base model to fine-tune for NER. The value can be: either \n",
" # your local path to a model or the huggingface path.\n",
" # In this case, we use the huggingface path:\n",
" # https://huggingface.co/khosseini/bert_1760_1900). You can\n",
" # https://huggingface.co/Livingwithmachines/bert_1760_1900). You can\n",
" # chose any other model from the HuggingFace hub, as long as it's\n",
" # trained on the \"Fill-Mask\" objective (filter by task).\n",
" model_path=\"../resources/models/\", # Path where the NER model will be stored.\n",
" training_args={\n",
" \"learning_rate\": 5e-5,\n",
" \"batch_size\": 16,\n",
" \"num_train_epochs\": 4,\n",
" \"weight_decay\": 0.01,\n",
" \"batch_size\": 8,\n",
" \"num_train_epochs\": 10,\n",
" \"learning_rate\": 0.00005,\n",
" \"weight_decay\": 0.0,\n",
" }, # Training arguments: you can change them.\n",
" overwrite_training=False, # Set to True if you want to overwrite an existing model with the same name.\n",
" do_test=False, # Set to True if you want to perform the training on test mode (the string \"_test\" will be appended to your model name).\n",
" do_test=True, # Set to True if you want to perform the training on test mode (the string \"_test\" will be appended to your model name).\n",
" load_from_hub=False, # Whether the model should be loaded from the HuggingFace hub\n",
")"
]
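The training notebook keeps the same fine-tuning workflow but moves the base model to the Livingwithmachines namespace and retunes the hyperparameters (batch size 16 → 8, epochs 4 → 10, weight decay 0.01 → 0.0), with do_test now True in the example. A hedged sketch of the step that would follow; the train() method name is an assumption suggested by test_training in tests/test_ner.py:

    # Fine-tune the base model with the updated hyperparameters; with
    # do_test=True the saved model name gets a "_test" suffix, per the
    # notebook's inline comment.
    myner.train()  # method name assumed, not shown in this diff

    # Wrap the fine-tuned model in a pipeline, as the tests do:
    pipe = myner.create_pipeline()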
4 changes: 2 additions & 2 deletions experiments/toponym_resolution.py
@@ -59,10 +59,10 @@
+ granularity
+ "_dev.json", # Path to the json file containing the test set (see note above).
pipe=None, # We'll store the NER pipeline here, leave this empty.
base_model="khosseini/bert_1760_1900", # Base model to fine-tune for NER. The value can be: either
base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune for NER. The value can be: either
# your local path to a model or the huggingface path.
# In this case, we use the huggingface path:
-# https://huggingface.co/khosseini/bert_1760_1900). You can
+# https://huggingface.co/Livingwithmachines/bert_1760_1900). You can
# chose any other model from the HuggingFace hub, as long as it's
# trained on the "Fill-Mask" objective (filter by task).
model_path="../resources/models/", # Path where the NER model will be stored.
86 changes: 36 additions & 50 deletions geoparser/pipeline.py
@@ -47,21 +47,8 @@ class Pipeline:
.. code-block:: python
recogniser.Recogniser(
model="blb_lwm-ner-fine",
pipe=None,
base_model="khosseini/bert_1760_1900",
train_dataset="../experiments/outputs/data/lwm/ner_fine_train.json",
test_dataset="../experiments/outputs/data/lwm/ner_fine_dev.json",
model_path="../resources/models/",
training_args={
"learning_rate": 5e-5,
"batch_size": 16,
"num_train_epochs": 4,
"weight_decay": 0.01,
},
overwrite_training=False,
do_test=False,
load_from_hub=False,
model="Livingwithmachines/toponym-19thC-en",
load_from_hub=True,
)
* The default settings for the ``Ranker``:
@@ -99,23 +86,9 @@ def __init__(

# If myner is None, instantiate the default Recogniser.
if not self.myner:
-dataset_path = "../experiments/outputs/data/lwm"
self.myner = recogniser.Recogniser(
model="blb_lwm-ner-fine",
pipe=None,
base_model="khosseini/bert_1760_1900",
train_dataset=f"{dataset_path}/ner_fine_train.json",
test_dataset=f"{dataset_path}/ner_fine_dev.json",
model_path="../resources/models/",
training_args={
"learning_rate": 5e-5,
"batch_size": 16,
"num_train_epochs": 4,
"weight_decay": 0.01,
},
overwrite_training=False,
do_test=False,
load_from_hub=False,
model="Livingwithmachines/toponym-19thC-en",
load_from_hub=True,
)

# If myranker is None, instantiate the default Ranker.
@@ -257,7 +230,15 @@ def run_sentence(
mentions_dataset = dict()
mentions_dataset["linking"] = []
for m in mentions:
-prediction = self.format_prediction(m, sentence, wk_cands=wk_cands, context=context, sent_idx=sent_idx, place=place, place_wqid=place_wqid)
+prediction = self.format_prediction(
+    m,
+    sentence,
+    wk_cands=wk_cands,
+    context=context,
+    sent_idx=sent_idx,
+    place=place,
+    place_wqid=place_wqid,
+)
mentions_dataset["linking"].append(prediction)

# If the linking method is "reldisamb", rank and format candidates,
@@ -508,11 +489,7 @@ def run_text(

return document_dataset


-def run_sentence_recognition(
-    self,
-    sentence
-) -> List[dict]:
+def run_sentence_recognition(self, sentence) -> List[dict]:
# Get predictions:
predictions = self.myner.ner_predict(sentence)

@@ -525,15 +502,16 @@
# Aggregate mentions:
mentions = ner.aggregate_mentions(procpreds, "pred")
return mentions


-def format_prediction(self, mention,
-    sentence: str,
-    wk_cands: Optional[dict] = None,
-    context: Optional[Tuple[str, str]] = ("", ""),
-    sent_idx: Optional[int] = 0,
-    place: Optional[str] = "",
-    place_wqid: Optional[str] = ""
+def format_prediction(
+    self,
+    mention,
+    sentence: str,
+    wk_cands: Optional[dict] = None,
+    context: Optional[Tuple[str, str]] = ("", ""),
+    sent_idx: Optional[int] = 0,
+    place: Optional[str] = "",
+    place_wqid: Optional[str] = "",
) -> dict:
prediction = dict()
prediction["mention"] = mention["mention"]
@@ -551,12 +529,12 @@ def format_prediction(self, mention,
prediction["place"] = place
prediction["place_wqid"] = place_wqid
if wk_cands:
prediction["string_match_candidates"] = wk_cands.get(mention["mention"], dict())
prediction["string_match_candidates"] = wk_cands.get(
mention["mention"], dict()
)
prediction["candidates"] = wk_cands.get(mention["mention"], dict())
return prediction



def run_text_recognition(
self,
text: str,
@@ -627,7 +605,15 @@ def run_text_recognition(

mentions_dataset = []
for m in mentions:
-prediction = self.format_prediction(m, sentence, wk_cands=None, context=context, sent_idx=idx, place=place, place_wqid=place_wqid)
+prediction = self.format_prediction(
+    m,
+    sentence,
+    wk_cands=None,
+    context=context,
+    sent_idx=idx,
+    place=place,
+    place_wqid=place_wqid,
+)
# mentions_dataset["linking"].append(prediction)
if not len(m["mention"]) == 1 and not m["mention"].islower():
mentions_dataset.append(prediction)
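With the default Recogniser now hub-backed, a Pipeline can be constructed with no arguments and no local training artefacts. A sketch under stated assumptions: the module path mirrors geoparser/pipeline.py, the run_sentence signature is inferred from the hunks above, and the Wikidata ID is illustrative:

    from geoparser import pipeline  # module path assumed from geoparser/pipeline.py

    geoparser_pipe = pipeline.Pipeline()  # no arguments: falls back to the hub-loaded Recogniser

    # place / place_wqid mirror the keywords threaded through format_prediction;
    # "Q42448" (Sheffield) is an illustrative value, not taken from this diff.
    resolved = geoparser_pipe.run_sentence(
        "A remarkable case of rattening has just occurred in the building trade at Sheffield.",
        place="Sheffield",
        place_wqid="Q42448",
    )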
22 changes: 5 additions & 17 deletions tests/test_ner.py
@@ -22,8 +22,7 @@ def test_training():

myner = recogniser.Recogniser(
model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended)
-pipe=None, # We'll store the NER pipeline here
-base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface)
+base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface)
train_dataset="experiments/outputs/data/lwm/ner_coarse_train.json", # Training set (part of overall training set)
test_dataset="experiments/outputs/data/lwm/ner_coarse_dev.json", # Test set (part of overall training set)
model_path="resources/models/", # Path where the NER model is or will be stored
@@ -48,8 +47,7 @@ def test_create_pipeline():
"""
myner = recogniser.Recogniser(
model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended)
-pipe=None, # We'll store the NER pipeline here
-base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface)
+base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface)
train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set)
test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set)
model_path="resources/models/", # Path where the NER model is or will be stored
@@ -73,8 +71,7 @@ def test_ner_predict():
def test_ner_predict():
myner = recogniser.Recogniser(
model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended)
-pipe=None, # We'll store the NER pipeline here
-base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface)
+base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface)
train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set)
test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set)
model_path="resources/models/", # Path where the NER model is or will be stored
@@ -107,15 +104,7 @@ def test_ner_predict():

def test_ner_load_from_hub():
myner = recogniser.Recogniser(
model="dslim/bert-base-NER", # Test loading from huggingface hub
pipe=None, # We'll store the NER pipeline here
base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface)
train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set)
test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set)
model_path="resources/models/", # Path where the NER model is or will be stored
training_args=dict(),
overwrite_training=False, # Set to True if you want to overwrite model if existing
do_test=False, # Set to True if you want to train on test mode
model="Livingwithmachines/toponym-19thC-en",
load_from_hub=True,
)
pipe = myner.create_pipeline()
@@ -128,8 +117,7 @@ def test_aggregate_mentions():
def test_aggregate_mentions():
myner = recogniser.Recogniser(
model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended)
-pipe=None, # We'll store the NER pipeline here
-base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface)
+base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface)
train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set)
test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set)
model_path="resources/models/", # Path where the NER model is or will be stored
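After the trim, test_ner_load_from_hub exercises exactly the new default path: no base model, datasets, or training arguments are needed. A condensed sketch of the test as it now reads (the import path and the closing assertion are assumptions; the test's real checks are elided in this view):

    from geoparser import recogniser  # import path assumed


    def test_ner_load_from_hub():
        # Hub loading needs only the model path and the flag.
        myner = recogniser.Recogniser(
            model="Livingwithmachines/toponym-19thC-en",
            load_from_hub=True,
        )
        pipe = myner.create_pipeline()
        assert pipe is not None  # placeholder; the test's real assertions are elided above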
