And upgrade torch, numpy, pandas and accelerate as well as black, pyright and flake8 (#100)

* And upgrade torch, numpy, pandas and accelerate as well as black, pyright and flake8, to fix the OutOfMemory error and to keep dependencies current.

* torch 2.1.2
johnml1135 authored Feb 3, 2024
1 parent f915db0 commit 4267780
Showing 90 changed files with 750 additions and 685 deletions.
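
Most of the changes below are mechanical reformatting from the black upgrade: black's 2024 stable style collapses function bodies that consist only of an ellipsis (...) onto the signature line, and several over-long signatures and call sites are rewrapped one argument per line with a trailing comma. The max-line-length bump in .flake8 appears to accommodate a few collapsed stub lines that now run slightly past 120 characters. A minimal sketch of the stub pattern, assuming black 24.x is the version being pinned (illustrative only, not code copied from this repository):

from abc import ABC, abstractmethod

# Before (older black): the ellipsis body sits on its own line.
class TextBefore(ABC):
    @property
    @abstractmethod
    def id(self) -> str:
        ...

# After (black 24.x stable style): the ellipsis is hugged onto the signature line.
class TextAfter(ABC):
    @property
    @abstractmethod
    def id(self) -> str: ...
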
2 changes: 1 addition & 1 deletion .flake8
@@ -1,5 +1,5 @@
 [flake8]
-max-line-length = 120
+max-line-length = 125
 extend-ignore = E203
 per-file-ignores = **/*.pyi:E252,E301,E302,E305,E501,E701,E704,E741,F401,F811,F821
 exclude =
12 changes: 4 additions & 8 deletions machine/annotations/range.py
@@ -69,25 +69,21 @@ def __repr__(self) -> str:
 class _RangeFactory(ABC, Generic[Offset]):
     @property
     @abstractmethod
-    def include_endpoint(self) -> bool:
-        ...
+    def include_endpoint(self) -> bool: ...

     def create(self, start: Offset, end: Optional[Offset]) -> Range[Offset]:
         if end is None:
             end = start
         return Range(self, start, end)

     @abstractmethod
-    def get_length(self, start: Offset, end: Offset) -> int:
-        ...
+    def get_length(self, start: Offset, end: Offset) -> int: ...

     @abstractmethod
-    def iterate(self, start: Offset, end: Offset) -> Iterable[Offset]:
-        ...
+    def iterate(self, start: Offset, end: Offset) -> Iterable[Offset]: ...

     @abstractmethod
-    def offset_compare(self, x: Offset, y: Offset) -> int:
-        ...
+    def offset_compare(self, x: Offset, y: Offset) -> int: ...


 class _IntRangeFactory(_RangeFactory[int]):
6 changes: 2 additions & 4 deletions machine/clusterers/cluster.py
@@ -5,12 +5,10 @@

 class Cluster(Generic[T]):
     @overload
-    def __init__(self, *data_objects: T, noise: bool = False, description: Optional[str] = None) -> None:
-        ...
+    def __init__(self, *data_objects: T, noise: bool = False, description: Optional[str] = None) -> None: ...

     @overload
-    def __init__(self, data_objects: Iterable[T], noise: bool = False, description: Optional[str] = None) -> None:
-        ...
+    def __init__(self, data_objects: Iterable[T], noise: bool = False, description: Optional[str] = None) -> None: ...

     def __init__(self, *args, **kwargs) -> None:
         self._data_objects: FrozenSet[T]
3 changes: 1 addition & 2 deletions machine/clusterers/flat_clusterer.py
@@ -8,5 +8,4 @@

 class FlatClusterer(ABC, Generic[T]):
     @abstractmethod
-    def generate_clusters(self, data_objects: Iterable[T]) -> Iterable[Cluster[T]]:
-        ...
+    def generate_clusters(self, data_objects: Iterable[T]) -> Iterable[Cluster[T]]: ...
3 changes: 1 addition & 2 deletions machine/clusterers/rooted_hierarchical_clusterer.py
@@ -12,5 +12,4 @@

 class RootedHierarchicalClusterer(ABC, Generic[T]):
     @abstractmethod
-    def generate_clusters(self, data_objects: Iterable[T]) -> DiGraph[Cluster[T]]:
-        ...
+    def generate_clusters(self, data_objects: Iterable[T]) -> DiGraph[Cluster[T]]: ...
3 changes: 1 addition & 2 deletions machine/clusterers/unrooted_hierarchical_clusterer.py
@@ -12,5 +12,4 @@

 class UnrootedHierarchicalClusterer(ABC, Generic[T]):
     @abstractmethod
-    def generate_clusters(self, data_objects: Iterable[T]) -> Graph[Cluster[T]]:
-        ...
+    def generate_clusters(self, data_objects: Iterable[T]) -> Graph[Cluster[T]]: ...
3 changes: 2 additions & 1 deletion machine/clusterers/upgma_clusterer.py
@@ -97,5 +97,6 @@ def get_all_data_objects_count(tree: DiGraph[Cluster[T]], cluster: Cluster[T]) -
     if tree.out_degree(cluster) == 0:
         return len(cluster.data_objects)
     return sum(
-        (get_all_data_objects_count(tree, edge[1]) for edge in tree.out_edges(cluster)), len(cluster.data_objects)
+        (get_all_data_objects_count(tree, edge[1]) for edge in tree.out_edges(cluster)),
+        len(cluster.data_objects),
     )
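
The rewrap above also makes the second argument to sum() easier to read: it is the start value, so the recursion adds the cluster's own data-object count to the counts gathered from its child subtrees. A tiny standalone illustration of the idiom (hypothetical values, not repository code):

# sum(iterable, start): the start value is added to the running total.
child_counts = (3, 2)   # e.g. counts returned for two child subtrees
own_count = 4           # e.g. len(cluster.data_objects) at this node
total = sum(child_counts, own_count)
print(total)            # 9
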
6 changes: 2 additions & 4 deletions machine/corpora/alignment_collection.py
@@ -7,10 +7,8 @@
 class AlignmentCollection(Corpus[AlignmentRow]):
     @property
     @abstractmethod
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...

     @property
     @abstractmethod
-    def sort_key(self) -> str:
-        ...
+    def sort_key(self) -> str: ...
3 changes: 1 addition & 2 deletions machine/corpora/alignment_corpus.py
@@ -13,8 +13,7 @@
 class AlignmentCorpus(Corpus[AlignmentRow]):
     @property
     @abstractmethod
-    def alignment_collections(self) -> Iterable[AlignmentCollection]:
-        ...
+    def alignment_collections(self) -> Iterable[AlignmentCollection]: ...

     def get_rows(self, text_ids: Optional[Iterable[str]] = None) -> ContextManagedGenerator[AlignmentRow, None, None]:
         return ContextManagedGenerator(self._get_rows(text_ids))
9 changes: 6 additions & 3 deletions machine/corpora/corpus.py
@@ -17,8 +17,7 @@ def get_rows(self) -> ContextManagedGenerator[Row, None, None]:
         return ContextManagedGenerator(self._get_rows())

     @abstractmethod
-    def _get_rows(self) -> Generator[Row, None, None]:
-        ...
+    def _get_rows(self) -> Generator[Row, None, None]: ...

     def __iter__(self) -> ContextManagedGenerator[Row, None, None]:
         return self.get_rows()
@@ -28,7 +27,11 @@ def count(self, include_empty: bool = True) -> int:
         return sum(1 for row in rows if include_empty or not row.is_empty)

     def interleaved_split(
-        self, percent: Optional[float] = None, size: Optional[int] = None, include_empty: bool = True, seed: Any = None
+        self,
+        percent: Optional[float] = None,
+        size: Optional[int] = None,
+        include_empty: bool = True,
+        seed: Any = None,
     ) -> Tuple[ContextManagedGenerator[Tuple[Row, bool], None, None], int, int]:
         corpus_size = self.count(include_empty)
         split_indices = get_split_indices(corpus_size, percent, size, seed)
6 changes: 2 additions & 4 deletions machine/corpora/dictionary_alignment_corpus.py
@@ -6,12 +6,10 @@

 class DictionaryAlignmentCorpus(AlignmentCorpus):
     @overload
-    def __init__(self, *alignment_collections: AlignmentCollection) -> None:
-        ...
+    def __init__(self, *alignment_collections: AlignmentCollection) -> None: ...

     @overload
-    def __init__(self, alignment_collections: Iterable[AlignmentCollection]) -> None:
-        ...
+    def __init__(self, alignment_collections: Iterable[AlignmentCollection]) -> None: ...

     def __init__(self, *args, **kwargs) -> None:
         alignment_collections: Iterable[AlignmentCollection]
6 changes: 2 additions & 4 deletions machine/corpora/dictionary_text_corpus.py
@@ -6,12 +6,10 @@

 class DictionaryTextCorpus(TextCorpus):
     @overload
-    def __init__(self, *texts: Text) -> None:
-        ...
+    def __init__(self, *texts: Text) -> None: ...

     @overload
-    def __init__(self, texts: Iterable[Text]) -> None:
-        ...
+    def __init__(self, texts: Iterable[Text]) -> None: ...

     def __init__(self, *args, **kwargs) -> None:
         texts: Iterable[Text]
6 changes: 2 additions & 4 deletions machine/corpora/file_stream_container.py
@@ -13,11 +13,9 @@ def __init__(self, filename: StrPath) -> None:
     def __enter__(self) -> FileStreamContainer:
         return self

-    def __exit__(self, type: Any, value: Any, traceback: Any) -> None:
-        ...
+    def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ...

     def open_stream(self) -> BinaryIO:
         return open(self._filename, "rb")

-    def close(self) -> None:
-        ...
+    def close(self) -> None: ...
11 changes: 4 additions & 7 deletions machine/corpora/flatten.py
@@ -13,18 +13,15 @@


 @overload
-def flatten(corpora: Iterable[TextCorpus]) -> TextCorpus:
-    ...
+def flatten(corpora: Iterable[TextCorpus]) -> TextCorpus: ...


 @overload
-def flatten(corpora: Iterable[AlignmentCorpus]) -> AlignmentCorpus:
-    ...
+def flatten(corpora: Iterable[AlignmentCorpus]) -> AlignmentCorpus: ...


 @overload
-def flatten(corpora: Iterable[ParallelTextCorpus]) -> ParallelTextCorpus:
-    ...
+def flatten(corpora: Iterable[ParallelTextCorpus]) -> ParallelTextCorpus: ...


 def flatten(corpora: Iterable[Corpus]) -> Corpus:
@@ -35,7 +32,7 @@ def flatten(corpora: Iterable[Corpus]) -> Corpus:
     if len(corpus_list) == 1:
         return corpus_list[0]

-    if any(type(corpus_list[0]) != type(corpus) for corpus in corpus_list[1:]):
+    if any(type(corpus_list[0]) != type(corpus) for corpus in corpus_list[1:]):  # noqa: E721
         raise TypeError("All corpora must be of the same type.")

     if isinstance(corpus_list[0], TextCorpus):
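
The # noqa: E721 added above suppresses flake8's warning about comparing types with != (E721 suggests identity checks or isinstance instead). The existing check compares concrete types, which isinstance would relax to also accept subclasses, so the commit keeps the comparison and silences the lint. A short sketch of the distinction, using hypothetical stand-in classes rather than the real corpus types:

# Stand-ins for two corpus classes where one subclasses the other.
class Base: ...
class Sub(Base): ...

a, b = Base(), Sub()

# Exact-type comparison, as in flatten(); flake8 flags the != spelling (E721),
# while an identity comparison expresses the same thing without the warning.
print(type(a) != type(b))      # True  -> different concrete types
print(type(a) is not type(b))  # True  -> same result, no E721

# isinstance accepts subclasses, so it would not make this distinction.
print(isinstance(b, type(a)))  # True  -> a Sub still counts as a Base
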
26 changes: 19 additions & 7 deletions machine/corpora/parallel_text_corpus.py
@@ -87,13 +87,11 @@ def from_hf_dataset(

     @property
     @abstractmethod
-    def is_source_tokenized(self) -> bool:
-        ...
+    def is_source_tokenized(self) -> bool: ...

     @property
     @abstractmethod
-    def is_target_tokenized(self) -> bool:
-        ...
+    def is_target_tokenized(self) -> bool: ...

     def invert(self) -> ParallelTextCorpus:
         def _invert(row: ParallelTextRow) -> ParallelTextRow:
@@ -304,7 +302,11 @@ def take(self, count: int) -> ParallelTextCorpus:
         return _TakeParallelTextCorpus(self, count)

     def split(
-        self, percent: Optional[float] = None, size: Optional[int] = None, include_empty: bool = True, seed: Any = None
+        self,
+        percent: Optional[float] = None,
+        size: Optional[int] = None,
+        include_empty: bool = True,
+        seed: Any = None,
     ) -> Tuple[ParallelTextCorpus, ParallelTextCorpus, int, int]:
         corpus_size = self.count(include_empty)
         split_indices = get_split_indices(corpus_size, percent, size, seed)
@@ -594,7 +596,12 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
                     AlignedWordPair.from_string(v) if isinstance(v, str) else [AlignedWordPair(t[0], t[1]) for t in v]
                 )
             yield ParallelTextRow(
-                text_id, refs, refs, [source] if len(source) > 0 else [], [target] if len(target) > 0 else [], alignment
+                text_id,
+                refs,
+                refs,
+                [source] if len(source) > 0 else [],
+                [target] if len(target) > 0 else [],
+                alignment,
             )


@@ -670,7 +677,12 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
                 alignment = [AlignedWordPair(si, ti) for (si, ti) in zip(src_indices, trg_indices)]

             yield ParallelTextRow(
-                text_id, refs, refs, [source] if len(source) > 0 else [], [target] if len(target) > 0 else [], alignment
+                text_id,
+                refs,
+                refs,
+                [source] if len(source) > 0 else [],
+                [target] if len(target) > 0 else [],
+                alignment,
             )
             index += 1

4 changes: 3 additions & 1 deletion machine/corpora/scripture_text_corpus.py
@@ -33,7 +33,9 @@ def _get_rows(self) -> Generator[TextRow, None, None]:
             yield from self._create_rows(vref)


-def create_versification_ref_corpus(versification: Versification = ORIGINAL_VERSIFICATION) -> ScriptureTextCorpus:
+def create_versification_ref_corpus(
+    versification: Versification = ORIGINAL_VERSIFICATION,
+) -> ScriptureTextCorpus:
     return ScriptureTextCorpus(
         versification,
         (
8 changes: 5 additions & 3 deletions machine/corpora/standard_parallel_text_corpus.py
@@ -181,9 +181,11 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
                             range_info,
                             src_row,
                             trg_row,
-                            alignment.aligned_word_pairs
-                            if alignment is not None and src_row.ref == alignment.ref
-                            else None,
+                            (
+                                alignment.aligned_word_pairs
+                                if alignment is not None and src_row.ref == alignment.ref
+                                else None
+                            ),
                         )

                     source_same_ref_rows.append(src_row)
12 changes: 4 additions & 8 deletions machine/corpora/stream_container.py
@@ -6,17 +6,13 @@

 class StreamContainer(ABC):
     @abstractmethod
-    def __enter__(self) -> StreamContainer:
-        ...
+    def __enter__(self) -> StreamContainer: ...

     @abstractmethod
-    def __exit__(self, type: Any, value: Any, traceback: Any) -> None:
-        ...
+    def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ...

     @abstractmethod
-    def open_stream(self) -> BinaryIO:
-        ...
+    def open_stream(self) -> BinaryIO: ...

     @abstractmethod
-    def close(self) -> None:
-        ...
+    def close(self) -> None: ...
6 changes: 2 additions & 4 deletions machine/corpora/text.py
@@ -7,10 +7,8 @@
 class Text(Corpus[TextRow]):
     @property
     @abstractmethod
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...

     @property
     @abstractmethod
-    def sort_key(self) -> str:
-        ...
+    def sort_key(self) -> str: ...
12 changes: 7 additions & 5 deletions machine/corpora/text_corpus.py
@@ -19,13 +19,11 @@
 class TextCorpus(Corpus[TextRow]):
     @property
     @abstractmethod
-    def texts(self) -> Iterable[Text]:
-        ...
+    def texts(self) -> Iterable[Text]: ...

     @property
     @abstractmethod
-    def is_tokenized(self) -> bool:
-        ...
+    def is_tokenized(self) -> bool: ...

     def get_rows(self, text_ids: Optional[Iterable[str]] = None) -> ContextManagedGenerator[TextRow, None, None]:
         return ContextManagedGenerator(self._get_rows(text_ids))
@@ -132,7 +130,11 @@ def take(self, count: int) -> TextCorpus:
         return _TakeTextCorpus(self, count)

     def split(
-        self, percent: Optional[float] = None, size: Optional[int] = None, include_empty: bool = True, seed: Any = None
+        self,
+        percent: Optional[float] = None,
+        size: Optional[int] = None,
+        include_empty: bool = True,
+        seed: Any = None,
     ) -> Tuple[TextCorpus, TextCorpus, int, int]:
         corpus_size = self.count(include_empty)
         split_indices = get_split_indices(corpus_size, percent, size, seed)
6 changes: 2 additions & 4 deletions machine/corpora/text_file_alignment_corpus.py
@@ -9,12 +9,10 @@

 class TextFileAlignmentCorpus(DictionaryAlignmentCorpus):
     @overload
-    def __init__(self, file_patterns: Iterable[StrPath]) -> None:
-        ...
+    def __init__(self, file_patterns: Iterable[StrPath]) -> None: ...

     @overload
-    def __init__(self, *file_patterns: StrPath) -> None:
-        ...
+    def __init__(self, *file_patterns: StrPath) -> None: ...

     def __init__(self, *args, **kwargs) -> None:
         file_patterns: Iterable[str]
6 changes: 2 additions & 4 deletions machine/corpora/text_file_text_corpus.py
@@ -9,12 +9,10 @@

 class TextFileTextCorpus(DictionaryTextCorpus):
     @overload
-    def __init__(self, file_patterns: Iterable[StrPath]) -> None:
-        ...
+    def __init__(self, file_patterns: Iterable[StrPath]) -> None: ...

     @overload
-    def __init__(self, *file_patterns: StrPath) -> None:
-        ...
+    def __init__(self, *file_patterns: StrPath) -> None: ...

     def __init__(self, *args, **kwargs) -> None:
         file_patterns: Iterable[str]
6 changes: 5 additions & 1 deletion machine/corpora/text_row.py
@@ -11,7 +11,11 @@ class TextRowFlags(Flag):

 class TextRow(Sequence[str]):
     def __init__(
-        self, text_id: str, ref: Any, segment: Sequence[str] = [], flags: TextRowFlags = TextRowFlags.SENTENCE_START
+        self,
+        text_id: str,
+        ref: Any,
+        segment: Sequence[str] = [],
+        flags: TextRowFlags = TextRowFlags.SENTENCE_START,
     ) -> None:
         self._text_id = text_id
         self._ref = ref