Skip to content

Commit

Permalink
Merge pull request #614 from NatLibFi/issue602-load-vocab-command
Browse files Browse the repository at this point in the history
Implement load-vocab and list-vocab commands
  • Loading branch information
osma authored Sep 2, 2022
2 parents f632673 + bd21582 commit 48b23f7
Show file tree
Hide file tree
Showing 12 changed files with 366 additions and 57 deletions.
109 changes: 93 additions & 16 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from annif.project import Access
from annif.suggestion import SuggestionFilter, ListSuggestionResult
from annif.exception import ConfigurationException, NotSupportedException
from annif.exception import NotInitializedException
from annif.util import metric_code

logger = annif.logger
Expand All @@ -41,17 +42,32 @@ def get_project(project_id):
sys.exit(1)


def open_documents(paths, subject_index, language, docs_limit):
def get_vocab(vocab_id):
    """Look up a vocabulary by its ID for CLI use, printing an error
    message and exiting with status 1 if it does not exist."""
    try:
        vocab = annif.registry.get_vocab(vocab_id,
                                         min_access=Access.private)
    except ValueError:
        # registry raises ValueError for an unknown vocabulary id
        click.echo(f"No vocabularies found with the id '{vocab_id}'.",
                   err=True)
        sys.exit(1)
    return vocab


def open_documents(paths, subject_index, vocab_lang, docs_limit):
"""Helper function to open a document corpus from a list of pathnames,
each of which is either a TSV file or a directory of TXT files. The
corpus will be returned as an instance of DocumentCorpus or
LimitingDocumentCorpus."""
each of which is either a TSV file or a directory of TXT files. For
directories with subjects in TSV files, the given vocabulary language
will be used to convert subject labels into URIs. The corpus will be
returned as an instance of DocumentCorpus or LimitingDocumentCorpus."""

def open_doc_path(path, subject_index):
"""open a single path and return it as a DocumentCorpus"""
if os.path.isdir(path):
return annif.corpus.DocumentDirectory(path, subject_index,
language,
vocab_lang,
require_subjects=True)
return annif.corpus.DocumentFile(path, subject_index)

Expand Down Expand Up @@ -165,6 +181,8 @@ def run_show_project(project_id):
click.echo(f'Project ID: {proj.project_id}')
click.echo(f'Project Name: {proj.name}')
click.echo(f'Language: {proj.language}')
click.echo(f'Vocabulary: {proj.vocab.vocab_id}')
click.echo(f'Vocab language: {proj.vocab_lang}')
click.echo(f'Access: {proj.access.name}')
click.echo(f'Trained: {proj.is_trained}')
click.echo(f'Modification time: {proj.modification_time}')
Expand All @@ -181,7 +199,34 @@ def run_clear_project(project_id):
proj.remove_model_data()


@cli.command('loadvoc')
@cli.command('list-vocabs')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_vocabs():
    """
    List available vocabularies.
    """

    # One row per vocabulary: ID, comma-separated languages, subject count
    # and whether the vocabulary data has been loaded.
    row_format = "{0: <20}{1: <20}{2: >10} {3: <6}"
    header = row_format.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))

    vocabularies = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabularies.values():
        try:
            langs = ','.join(sorted(vocab.languages))
            size = len(vocab)
            is_loaded = True
        except NotInitializedException:
            # vocabulary data not loaded yet; show placeholders
            langs, size, is_loaded = '-', '-', False
        click.echo(row_format.format(vocab.vocab_id, langs, size,
                                     str(is_loaded)))


@cli.command('loadvoc', deprecated=True)
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--force', '-f', default=False, is_flag=True,
Expand Down Expand Up @@ -214,10 +259,42 @@ def run_loadvoc(project_id, force, subjectfile):
subjects = annif.corpus.SubjectFileCSV(subjectfile)
else:
# probably a TSV file
subjects = annif.corpus.SubjectFileTSV(subjectfile, proj.language)
subjects = annif.corpus.SubjectFileTSV(subjectfile, proj.vocab_lang)
proj.vocab.load_vocabulary(subjects, force=force)


@cli.command('load-vocab')
@click.argument('vocab_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--language', '-L', help='Language of subject file')
@click.option('--force', '-f', default=False, is_flag=True,
              help='Replace existing vocabulary completely '
                   'instead of updating it')
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)

    # Detect the subject file format and build the matching corpus object.
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # an RDF serialization that rdflib can parse (SKOS vocabulary)
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # assume TSV; its labels are monolingual, so the language of the
        # file must be given explicitly
        if not language:
            click.echo("Please use --language option to set the language "
                       "of a TSV vocabulary.", err=True)
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)

    vocab.load_vocabulary(subjects, force=force)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
Expand Down Expand Up @@ -252,7 +329,7 @@ def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
documents = 'cached'
else:
documents = open_documents(paths, proj.subjects,
proj.vocab.language, docs_limit)
proj.vocab_lang, docs_limit)
proj.train(documents, backend_params, jobs)


Expand All @@ -275,7 +352,7 @@ def run_learn(project_id, paths, docs_limit, backend_param):
proj = get_project(project_id)
backend_params = parse_backend_params(backend_param, proj)
documents = open_documents(paths, proj.subjects,
proj.vocab.language, docs_limit)
proj.vocab_lang, docs_limit)
proj.learn(documents, backend_params)


Expand Down Expand Up @@ -303,7 +380,7 @@ def run_suggest(project_id, limit, threshold, backend_param):
"<{}>\t{}\t{}".format(
subj.uri,
'\t'.join(filter(None,
(subj.labels[project.vocab.language],
(subj.labels[project.vocab_lang],
subj.notation))),
hit.score))

Expand Down Expand Up @@ -334,7 +411,7 @@ def run_index(project_id, directory, suffix, force,
hit_filter = SuggestionFilter(project.subjects, limit, threshold)

for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
directory, project.subjects, project.language,
directory, project.subjects, project.vocab_lang,
require_subjects=False):
with open(docfilename, encoding='utf-8-sig') as docfile:
text = docfile.read()
Expand All @@ -350,7 +427,7 @@ def run_index(project_id, directory, suffix, force,
subj = project.subjects[hit.subject_id]
line = "<{}>\t{}\t{}".format(
subj.uri,
'\t'.join(filter(None, (subj.labels[project.language],
'\t'.join(filter(None, (subj.labels[project.vocab_lang],
subj.notation))),
hit.score)
click.echo(line, file=subjfile)
Expand Down Expand Up @@ -432,7 +509,7 @@ def run_eval(
raise NotSupportedException(
"cannot open results-file for writing: " + str(e))
docs = open_documents(paths, project.subjects,
project.vocab.language, docs_limit)
project.vocab_lang, docs_limit)

jobs, pool_class = annif.parallel.get_pool(jobs)

Expand All @@ -449,7 +526,7 @@ def run_eval(
template = "{0:<30}\t{1}"
metrics = eval_batch.results(metrics=metric,
results_file=results_file,
language=project.vocab.language)
language=project.vocab_lang)
for metric, score in metrics.items():
click.echo(template.format(metric + ":", score))
if metrics_file:
Expand Down Expand Up @@ -484,7 +561,7 @@ def run_optimize(project_id, paths, docs_limit, backend_param):

ndocs = 0
docs = open_documents(paths, project.subjects,
project.vocab.language, docs_limit)
project.vocab_lang, docs_limit)
for doc in docs.documents:
raw_hits = project.suggest(doc.text, backend_params)
hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
Expand Down Expand Up @@ -567,7 +644,7 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
"""
proj = get_project(project_id)
documents = open_documents(paths, proj.subjects,
proj.vocab.language, docs_limit)
proj.vocab_lang, docs_limit)
click.echo(f"Looking for optimal hyperparameters using {trials} trials")
rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
click.echo(f"Got best {metric} score {rec.score:.4f} with:")
Expand Down
4 changes: 4 additions & 0 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ def load_subjects(self, corpus):
def __len__(self):
    """Return the number of subjects in the index."""
    return len(self._subjects)

@property
def languages(self):
    """The languages of the subjects in this index."""
    # NOTE(review): _languages is initialized outside this view;
    # presumably a collection of language codes -- confirm in __init__.
    return self._languages

def __getitem__(self, subject_id):
    """Return the subject stored under subject_id (enables [] access)."""
    return self._subjects[subject_id]

Expand Down
21 changes: 15 additions & 6 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class AnnifProject(DatadirMixin):
_analyzer = None
_backend = None
_vocab = None
_vocab_lang = None
initialized = False

# default values for configuration settings
Expand Down Expand Up @@ -148,17 +149,25 @@ def backend(self):
backend_id)
return self._backend

def _initialize_vocab(self):
    """Resolve the project's vocab_spec into a vocabulary and language,
    caching them in _vocab and _vocab_lang.

    Raises ConfigurationException if the project has no vocab setting."""
    if self.vocab_spec is None:
        raise ConfigurationException("vocab setting is missing",
                                     project_id=self.project_id)
    # registry.get_vocab returns an (AnnifVocabulary, language) pair;
    # the project's own language is used as the default
    self._vocab, self._vocab_lang = self.registry.get_vocab(
        self.vocab_spec, self.language)

@property
def vocab(self):
if self._vocab is None:
if self.vocab_spec is None:
raise ConfigurationException("vocab setting is missing",
project_id=self.project_id)
self._vocab = self.registry.get_vocab(self.vocab_spec,
self.language)

self._initialize_vocab()
return self._vocab

@property
def vocab_lang(self):
    """The language used with the project's vocabulary, resolved lazily
    from the vocab specification on first access."""
    if self._vocab_lang is None:
        self._initialize_vocab()
    return self._vocab_lang

@property
def subjects(self):
    """The subject index of the project's vocabulary (delegates to
    vocab.subjects)."""
    return self.vocab.subjects
Expand Down
38 changes: 33 additions & 5 deletions annif/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from flask import current_app
import annif
from annif.config import parse_config
from annif.exception import ConfigurationException
from annif.project import Access, AnnifProject
from annif.vocab import AnnifVocabulary
from annif.util import parse_args
Expand Down Expand Up @@ -71,8 +72,10 @@ def get_project(self, project_id, min_access=Access.private):
raise ValueError("No such project {}".format(project_id))

def get_vocab(self, vocab_spec, default_language):
"""Return an AnnifVocabulary corresponding to the vocab_spec. If no
language information is specified, use the given default language."""
"""Return an (AnnifVocabulary, language) pair corresponding to the
vocab_spec. If no language information is specified, use the given
default language."""

match = re.match(r'(\w+)(\((.*)\))?', vocab_spec)
if match is None:
raise ValueError(
Expand All @@ -84,8 +87,8 @@ def get_vocab(self, vocab_spec, default_language):

if vocab_key not in self._vocabs[self._rid]:
self._vocabs[self._rid][vocab_key] = AnnifVocabulary(
vocab_id, self._datadir, language)
return self._vocabs[self._rid][vocab_key]
vocab_id, self._datadir)
return self._vocabs[self._rid][vocab_key], language


def initialize_projects(app):
Expand Down Expand Up @@ -113,4 +116,29 @@ def get_project(project_id, min_access=Access.private):
try:
return projects[project_id]
except KeyError:
raise ValueError("No such project {}".format(project_id))
raise ValueError(f"No such project '{project_id}'")


def get_vocabs(min_access=Access.private):
    """Collect the vocabularies used by the available projects into a
    dict mapping vocab_id to AnnifVocabulary. Only projects at or above
    the given minimum access level are considered."""

    available = {}
    for project in get_projects(min_access).values():
        try:
            vocab = project.vocab
        except ConfigurationException:
            # project has no (valid) vocab configured; skip it
            continue
        available[vocab.vocab_id] = vocab
    return available


def get_vocab(vocab_id, min_access=Access.private):
    """Return a single AnnifVocabulary by vocabulary id.

    The min_access parameter sets the minimum access level required for
    the vocabulary to be included in the lookup. Raises ValueError if no
    vocabulary with the given id exists."""

    vocabs = get_vocabs(min_access)
    try:
        return vocabs[vocab_id]
    except KeyError:
        # the KeyError is an implementation detail of the dict lookup;
        # suppress the implicit chain so callers see only the ValueError
        raise ValueError(f"No such vocabulary '{vocab_id}'") from None
2 changes: 1 addition & 1 deletion annif/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def suggest(project_id, text, limit, threshold):
return server_error(err)
hits = hit_filter(result).as_list()
return {'results': [_suggestion_to_dict(hit, project.subjects,
project.vocab.language)
project.vocab_lang)
for hit in hits]}


Expand Down
18 changes: 13 additions & 5 deletions annif/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,9 @@ class AnnifVocabulary(DatadirMixin):
INDEX_FILENAME_TTL = "subjects.ttl"
INDEX_FILENAME_CSV = "subjects.csv"

def __init__(self, vocab_id, datadir, language):
def __init__(self, vocab_id, datadir):
DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
self.vocab_id = vocab_id
self.language = language
self._skos_vocab = None

def _create_subject_index(self, subject_corpus):
Expand Down Expand Up @@ -98,20 +97,29 @@ def skos(self):

raise NotInitializedException(f'graph file {path} not found')

def __len__(self):
    """Return the number of subjects in the vocabulary (delegates to
    the subject index)."""
    return len(self.subjects)

@property
def languages(self):
    """The languages of the vocabulary, as reported by its subject
    index."""
    return self.subjects.languages

def load_vocabulary(self, subject_corpus, force=False):
"""Load subjects from a subject corpus and save them into one
or more subject index files as well as a SKOS/Turtle file for later
use. If force=True, replace the existing subject index completely."""

if not force and os.path.exists(
os.path.join(self.datadir, self.INDEX_FILENAME_CSV)):
logger.info('updating existing vocabulary')
logger.info('updating existing subject index')
self._subjects = self._update_subject_index(subject_corpus)
else:
logger.info('creating subject index')
self._subjects = self._create_subject_index(subject_corpus)

subject_corpus.save_skos(
os.path.join(self.datadir, self.INDEX_FILENAME_TTL))
skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
logger.info(f'saving vocabulary into SKOS file {skosfile}')
subject_corpus.save_skos(skosfile)

def as_graph(self):
"""return the vocabulary as an rdflib graph"""
Expand Down
Loading

0 comments on commit 48b23f7

Please sign in to comment.