Skip to content

Commit

Permalink
Merge pull request #614 from NatLibFi/issue602-load-vocab-command
Browse files Browse the repository at this point in the history
Implement load-vocab and list-vocab commands
  • Loading branch information
osma authored Sep 2, 2022
2 parents f632673 + bd21582 commit 48b23f7
Show file tree
Hide file tree
Showing 12 changed files with 366 additions and 57 deletions.
109 changes: 93 additions & 16 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from annif.project import Access
from annif.suggestion import SuggestionFilter, ListSuggestionResult
from annif.exception import ConfigurationException, NotSupportedException
from annif.exception import NotInitializedException
from annif.util import metric_code

logger = annif.logger
Expand All @@ -41,17 +42,32 @@ def get_project(project_id):
sys.exit(1)


def open_documents(paths, subject_index, language, docs_limit):
def get_vocab(vocab_id):
    """Look up a vocabulary by its ID for CLI use, printing an error
    message and exiting with status 1 if it does not exist."""
    try:
        vocab = annif.registry.get_vocab(vocab_id,
                                         min_access=Access.private)
    except ValueError:
        # registry raises ValueError for an unknown vocabulary id
        click.echo(f"No vocabularies found with the id '{vocab_id}'.",
                   err=True)
        sys.exit(1)
    return vocab


def open_documents(paths, subject_index, vocab_lang, docs_limit):
"""Helper function to open a document corpus from a list of pathnames,
each of which is either a TSV file or a directory of TXT files. The
corpus will be returned as an instance of DocumentCorpus or
LimitingDocumentCorpus."""
each of which is either a TSV file or a directory of TXT files. For
directories with subjects in TSV files, the given vocabulary language
will be used to convert subject labels into URIs. The corpus will be
returned as an instance of DocumentCorpus or LimitingDocumentCorpus."""

def open_doc_path(path, subject_index):
"""open a single path and return it as a DocumentCorpus"""
if os.path.isdir(path):
return annif.corpus.DocumentDirectory(path, subject_index,
language,
vocab_lang,
require_subjects=True)
return annif.corpus.DocumentFile(path, subject_index)

Expand Down Expand Up @@ -165,6 +181,8 @@ def run_show_project(project_id):
click.echo(f'Project ID: {proj.project_id}')
click.echo(f'Project Name: {proj.name}')
click.echo(f'Language: {proj.language}')
click.echo(f'Vocabulary: {proj.vocab.vocab_id}')
click.echo(f'Vocab language: {proj.vocab_lang}')
click.echo(f'Access: {proj.access.name}')
click.echo(f'Trained: {proj.is_trained}')
click.echo(f'Modification time: {proj.modification_time}')
Expand All @@ -181,7 +199,34 @@ def run_clear_project(project_id):
proj.remove_model_data()


@cli.command('loadvoc')
@cli.command('list-vocabs')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_vocabs():
    """
    List available vocabularies.
    """

    # One row per vocabulary: ID, comma-separated languages, subject count
    # and whether the vocabulary data has been loaded.
    row_format = "{0: <20}{1: <20}{2: >10} {3: <6}"
    header = row_format.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))

    vocabularies = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabularies.values():
        try:
            langs = ','.join(sorted(vocab.languages))
            size = len(vocab)
            is_loaded = True
        except NotInitializedException:
            # vocabulary data not loaded yet; show placeholders
            langs, size, is_loaded = '-', '-', False
        click.echo(row_format.format(vocab.vocab_id, langs, size,
                                     str(is_loaded)))


@cli.command('loadvoc', deprecated=True)
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--force', '-f', default=False, is_flag=True,
Expand Down Expand Up @@ -214,10 +259,42 @@ def run_loadvoc(project_id, force, subjectfile):
subjects = annif.corpus.SubjectFileCSV(subjectfile)
else:
# probably a TSV file
subjects = annif.corpus.SubjectFileTSV(subjectfile, proj.language)
subjects = annif.corpus.SubjectFileTSV(subjectfile, proj.vocab_lang)
proj.vocab.load_vocabulary(subjects, force=force)


@cli.command('load-vocab')
@click.argument('vocab_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--language', '-L', help='Language of subject file')
@click.option('--force', '-f', default=False, is_flag=True,
              help='Replace existing vocabulary completely '
                   'instead of updating it')
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)

    # Detect the subject file format and build the matching corpus object.
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # an RDF serialization that rdflib can parse (SKOS vocabulary)
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # assume TSV; its labels are monolingual, so the language of the
        # file must be given explicitly
        if not language:
            click.echo("Please use --language option to set the language "
                       "of a TSV vocabulary.", err=True)
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)

    vocab.load_vocabulary(subjects, force=force)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
Expand Down Expand Up @@ -252,7 +329,7 @@ def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
documents = 'cached'
else:
documents = open_documents(paths, proj.subjects,
proj.vocab.language, docs_limit)
proj.vocab_lang, docs_limit)
proj.train(documents, backend_params, jobs)


Expand All @@ -275,7 +352,7 @@ def run_learn(project_id, paths, docs_limit, backend_param):
proj = get_project(project_id)
backend_params = parse_backend_params(backend_param, proj)
documents = open_documents(paths, proj.subjects,
proj.vocab.language, docs_limit)
proj.vocab_lang, docs_limit)
proj.learn(documents, backend_params)


Expand Down Expand Up @@ -303,7 +380,7 @@ def run_suggest(project_id, limit, threshold, backend_param):
"<{}>\t{}\t{}".format(
subj.uri,
'\t'.join(filter(None,
(subj.labels[project.vocab.language],
(subj.labels[project.vocab_lang],
subj.notation))),
hit.score))

Expand Down Expand Up @@ -334,7 +411,7 @@ def run_index(project_id, directory, suffix, force,
hit_filter = SuggestionFilter(project.subjects, limit, threshold)

for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
directory, project.subjects, project.language,
directory, project.subjects, project.vocab_lang,
require_subjects=False):
with open(docfilename, encoding='utf-8-sig') as docfile:
text = docfile.read()
Expand All @@ -350,7 +427,7 @@ def run_index(project_id, directory, suffix, force,
subj = project.subjects[hit.subject_id]
line = "<{}>\t{}\t{}".format(
subj.uri,
'\t'.join(filter(None, (subj.labels[project.language],
'\t'.join(filter(None, (subj.labels[project.vocab_lang],
subj.notation))),
hit.score)
click.echo(line, file=subjfile)
Expand Down Expand Up @@ -432,7 +509,7 @@ def run_eval(
raise NotSupportedException(
"cannot open results-file for writing: " + str(e))
docs = open_documents(paths, project.subjects,
project.vocab.language, docs_limit)
project.vocab_lang, docs_limit)

jobs, pool_class = annif.parallel.get_pool(jobs)

Expand All @@ -449,7 +526,7 @@ def run_eval(
template = "{0:<30}\t{1}"
metrics = eval_batch.results(metrics=metric,
results_file=results_file,
language=project.vocab.language)
language=project.vocab_lang)
for metric, score in metrics.items():
click.echo(template.format(metric + ":", score))
if metrics_file:
Expand Down Expand Up @@ -484,7 +561,7 @@ def run_optimize(project_id, paths, docs_limit, backend_param):

ndocs = 0
docs = open_documents(paths, project.subjects,
project.vocab.language, docs_limit)
project.vocab_lang, docs_limit)
for doc in docs.documents:
raw_hits = project.suggest(doc.text, backend_params)
hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
Expand Down Expand Up @@ -567,7 +644,7 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
"""
proj = get_project(project_id)
documents = open_documents(paths, proj.subjects,
proj.vocab.language, docs_limit)
proj.vocab_lang, docs_limit)
click.echo(f"Looking for optimal hyperparameters using {trials} trials")
rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
click.echo(f"Got best {metric} score {rec.score:.4f} with:")
Expand Down
4 changes: 4 additions & 0 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ def load_subjects(self, corpus):
def __len__(self):
    """Return the number of subjects in the index."""
    return len(self._subjects)

@property
def languages(self):
    """The languages of the subjects in this index."""
    # NOTE(review): _languages is initialized outside this view;
    # presumably a collection of language codes -- confirm in __init__.
    return self._languages

def __getitem__(self, subject_id):
    """Return the subject stored under subject_id (enables [] access)."""
    return self._subjects[subject_id]

Expand Down
21 changes: 15 additions & 6 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class AnnifProject(DatadirMixin):
_analyzer = None
_backend = None
_vocab = None
_vocab_lang = None
initialized = False

# default values for configuration settings
Expand Down Expand Up @@ -148,17 +149,25 @@ def backend(self):
backend_id)
return self._backend

def _initialize_vocab(self):
    """Resolve the project's vocab_spec into a vocabulary and language,
    caching them in _vocab and _vocab_lang.

    Raises ConfigurationException if the project has no vocab setting."""
    if self.vocab_spec is None:
        raise ConfigurationException("vocab setting is missing",
                                     project_id=self.project_id)
    # registry.get_vocab returns an (AnnifVocabulary, language) pair;
    # the project's own language is used as the default
    self._vocab, self._vocab_lang = self.registry.get_vocab(
        self.vocab_spec, self.language)

@property
def vocab(self):
if self._vocab is None:
if self.vocab_spec is None:
raise ConfigurationException("vocab setting is missing",
project_id=self.project_id)
self._vocab = self.registry.get_vocab(self.vocab_spec,
self.language)

self._initialize_vocab()
return self._vocab

@property
def vocab_lang(self):
    """The language used with the project's vocabulary, resolved lazily
    from the vocab specification on first access."""
    if self._vocab_lang is None:
        self._initialize_vocab()
    return self._vocab_lang

@property
def subjects(self):
    """The subject index of the project's vocabulary (delegates to
    vocab.subjects)."""
    return self.vocab.subjects
Expand Down
38 changes: 33 additions & 5 deletions annif/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from flask import current_app
import annif
from annif.config import parse_config
from annif.exception import ConfigurationException
from annif.project import Access, AnnifProject
from annif.vocab import AnnifVocabulary
from annif.util import parse_args
Expand Down Expand Up @@ -71,8 +72,10 @@ def get_project(self, project_id, min_access=Access.private):
raise ValueError("No such project {}".format(project_id))

def get_vocab(self, vocab_spec, default_language):
"""Return an AnnifVocabulary corresponding to the vocab_spec. If no
language information is specified, use the given default language."""
"""Return an (AnnifVocabulary, language) pair corresponding to the
vocab_spec. If no language information is specified, use the given
default language."""

match = re.match(r'(\w+)(\((.*)\))?', vocab_spec)
if match is None:
raise ValueError(
Expand All @@ -84,8 +87,8 @@ def get_vocab(self, vocab_spec, default_language):

if vocab_key not in self._vocabs[self._rid]:
self._vocabs[self._rid][vocab_key] = AnnifVocabulary(
vocab_id, self._datadir, language)
return self._vocabs[self._rid][vocab_key]
vocab_id, self._datadir)
return self._vocabs[self._rid][vocab_key], language


def initialize_projects(app):
Expand Down Expand Up @@ -113,4 +116,29 @@ def get_project(project_id, min_access=Access.private):
try:
return projects[project_id]
except KeyError:
raise ValueError("No such project {}".format(project_id))
raise ValueError(f"No such project '{project_id}'")


def get_vocabs(min_access=Access.private):
    """Collect the vocabularies used by the available projects into a
    dict mapping vocab_id to AnnifVocabulary. Only projects at or above
    the given minimum access level are considered."""

    available = {}
    for project in get_projects(min_access).values():
        try:
            vocab = project.vocab
        except ConfigurationException:
            # project has no (valid) vocab configured; skip it
            continue
        available[vocab.vocab_id] = vocab
    return available


def get_vocab(vocab_id, min_access=Access.private):
    """Return a single AnnifVocabulary by vocabulary id.

    The min_access parameter sets the minimum access level required for
    the vocabulary to be included in the lookup. Raises ValueError if no
    vocabulary with the given id exists."""

    vocabs = get_vocabs(min_access)
    try:
        return vocabs[vocab_id]
    except KeyError:
        # the KeyError is an implementation detail of the dict lookup;
        # suppress the implicit chain so callers see only the ValueError
        raise ValueError(f"No such vocabulary '{vocab_id}'") from None
2 changes: 1 addition & 1 deletion annif/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def suggest(project_id, text, limit, threshold):
return server_error(err)
hits = hit_filter(result).as_list()
return {'results': [_suggestion_to_dict(hit, project.subjects,
project.vocab.language)
project.vocab_lang)
for hit in hits]}


Expand Down
18 changes: 13 additions & 5 deletions annif/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,9 @@ class AnnifVocabulary(DatadirMixin):
INDEX_FILENAME_TTL = "subjects.ttl"
INDEX_FILENAME_CSV = "subjects.csv"

def __init__(self, vocab_id, datadir, language):
def __init__(self, vocab_id, datadir):
DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
self.vocab_id = vocab_id
self.language = language
self._skos_vocab = None

def _create_subject_index(self, subject_corpus):
Expand Down Expand Up @@ -98,20 +97,29 @@ def skos(self):

raise NotInitializedException(f'graph file {path} not found')

def __len__(self):
    """Return the number of subjects in the vocabulary (delegates to
    the subject index)."""
    return len(self.subjects)

@property
def languages(self):
    """The languages of the vocabulary, as reported by its subject
    index."""
    return self.subjects.languages

def load_vocabulary(self, subject_corpus, force=False):
"""Load subjects from a subject corpus and save them into one
or more subject index files as well as a SKOS/Turtle file for later
use. If force=True, replace the existing subject index completely."""

if not force and os.path.exists(
os.path.join(self.datadir, self.INDEX_FILENAME_CSV)):
logger.info('updating existing vocabulary')
logger.info('updating existing subject index')
self._subjects = self._update_subject_index(subject_corpus)
else:
logger.info('creating subject index')
self._subjects = self._create_subject_index(subject_corpus)

subject_corpus.save_skos(
os.path.join(self.datadir, self.INDEX_FILENAME_TTL))
skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
logger.info(f'saving vocabulary into SKOS file {skosfile}')
subject_corpus.save_skos(skosfile)

def as_graph(self):
"""return the vocabulary as an rdflib graph"""
Expand Down
Loading

0 comments on commit 48b23f7

Please sign in to comment.