Skip to content

Commit

Permalink
feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config pa…
Browse files Browse the repository at this point in the history
…rameters (Unstructured-IO#3014)

This PR introduces the GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR
configuration parameters, which control where temporary files are stored
during the partition flow by setting tempfile.tempdir.

#### Edit:
Renamed prefixes from STORAGE_ to UNSTRUCTURED_CACHE_

#### Edit 2:
Renamed prefixes from UNSTRUCTURED_CACHE_ to GLOBAL_WORKING_DIR_
  • Loading branch information
amadeusz-ds authored May 17, 2024
1 parent ec987dc commit 1c8b2b2
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 7 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.0-dev14
## 0.14.0-dev15

### BREAKING CHANGES

Expand All @@ -9,6 +9,7 @@
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
* **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.

### Features
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.
Expand Down
11 changes: 9 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ def test_save_elements(
assert not el.metadata.image_mime_type


def test_save_elements_with_output_dir_path_none():
@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
with (
patch("PIL.Image.open"),
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
Expand All @@ -161,7 +163,12 @@ def test_save_elements_with_output_dir_path_none():
)

# Verify that the images are saved in the expected directory
expected_output_dir = os.path.join(tmpdir, "figures")
if storage_enabled:
from unstructured.partition.utils.config import env_config

expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
else:
expected_output_dir = os.path.join(tmpdir, "figures")
assert os.path.exists(expected_output_dir)
assert os.path.isdir(expected_output_dir)
os.chdir(original_cwd)
Expand Down
47 changes: 47 additions & 0 deletions test_unstructured/partition/utils/test_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import shutil
import tempfile
from pathlib import Path

import pytest


def test_default_config():
from unstructured.partition.utils.config import env_config

Expand All @@ -9,3 +16,43 @@ def test_env_override(monkeypatch):
from unstructured.partition.utils.config import env_config

assert env_config.IMAGE_CROP_PAD == 1


@pytest.fixture()
def _setup_tmpdir():
# Fixture body: park any existing GLOBAL_WORKING_PROCESS_DIR directory under a "_bak"
# sibling before the test and move it back afterwards.
from unstructured.partition.utils.config import env_config

# remember the tempfile override that was active before the test
_tmpdir = tempfile.tempdir
_storage_tmpdir = env_config.GLOBAL_WORKING_PROCESS_DIR
_storage_tmpdir_bak = f"{env_config.GLOBAL_WORKING_PROCESS_DIR}_bak"
if Path(_storage_tmpdir).is_dir():
shutil.move(_storage_tmpdir, _storage_tmpdir_bak)
# clear the override so tempfile works out its default directory again
tempfile.tempdir = None
# hand control to the test; the statements below run as teardown
yield
if Path(_storage_tmpdir_bak).is_dir():
# drop whatever the test left at the original location before restoring the backup
if Path(_storage_tmpdir).is_dir():
shutil.rmtree(_storage_tmpdir)
shutil.move(_storage_tmpdir_bak, _storage_tmpdir)
tempfile.tempdir = _tmpdir


@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_disabled(monkeypatch):
    """With the feature switched off, the process dir is not created and is not the tempdir."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false")
    from unstructured.partition.utils.config import env_config

    default_working_dir = str(Path.home() / ".cache/unstructured")

    assert not env_config.GLOBAL_WORKING_DIR_ENABLED
    assert env_config.GLOBAL_WORKING_DIR == default_working_dir
    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    assert not Path(process_dir).is_dir()
    assert tempfile.gettempdir() != process_dir


@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_enabled(monkeypatch):
    """With the feature switched on, the process dir exists and is the active tempdir."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true")
    from unstructured.partition.utils.config import env_config

    default_working_dir = str(Path.home() / ".cache/unstructured")

    assert env_config.GLOBAL_WORKING_DIR_ENABLED
    assert env_config.GLOBAL_WORKING_DIR == default_working_dir
    # read the property only after the checks above, keeping the original evaluation order
    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    assert Path(process_dir).is_dir()
    assert tempfile.gettempdir() == process_dir
4 changes: 4 additions & 0 deletions unstructured/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .partition.utils.config import env_config

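# Importing env_config instantiates ENVConfig at package import time; the bare reference
# below is presumably there to mark the import as used.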
env_config
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.0-dev14" # pragma: no cover
__version__ = "0.14.0-dev15" # pragma: no cover
1 change: 0 additions & 1 deletion unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
@abstractmethod
def _process_document(self, doc: Path) -> list:
"""Should return all metadata and metrics for a single document."""
pass


@dataclass
Expand Down
9 changes: 9 additions & 0 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import re
import warnings
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast

import numpy as np
Expand Down Expand Up @@ -438,6 +439,14 @@ def _partition_pdf_or_image_local(
)

if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
Expand Down
7 changes: 5 additions & 2 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import tempfile
from copy import deepcopy
from io import BytesIO
from pathlib import PurePath
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast

import cv2
Expand Down Expand Up @@ -131,7 +131,10 @@ def save_elements(
"""

if not output_dir_path:
output_dir_path = os.path.join(os.getcwd(), "figures")
if env_config.GLOBAL_WORKING_DIR_ENABLED:
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
else:
output_dir_path = str(Path.cwd() / "figures")
os.makedirs(output_dir_path, exist_ok=True)

with tempfile.TemporaryDirectory() as temp_dir:
Expand Down
45 changes: 45 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,28 @@
"""

import os
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT


@lru_cache(maxsize=1)
def get_tempdir(dir: str) -> str:
    """Return the per-process-group temporary directory path under *dir*.

    The result has the form ``{dir}/tmp/{id}``, where ``id`` is the process group id of the
    calling process. Only the path string is computed; nothing is created on disk. The most
    recent result is cached.

    Args:
        dir: base working directory. The name shadows the builtin but is kept because
            callers pass it by keyword.

    Returns:
        The directory path as a string.
    """
    # `os.getpgid` only exists on Unix; fall back to the process id elsewhere so that
    # enabling the working dir does not raise AttributeError on other platforms.
    getpgid = getattr(os, "getpgid", None)
    group_id = getpgid(0) if getpgid is not None else os.getpid()
    tempdir = Path(dir) / f"tmp/{group_id}"
    return str(tempdir)


@dataclass
class ENVConfig:
"""class for configuring environment parameters"""

def __post_init__(self):
    """Set up the process temp directory at construction time when the feature is enabled."""
    if not self.GLOBAL_WORKING_DIR_ENABLED:
        return
    self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR)

def _get_string(self, var: str, default_value: str = "") -> str:
"""attempt to get the value of var from the os environment; if not present return the
default_value"""
Expand All @@ -31,6 +44,15 @@ def _get_float(self, var: str, default_value: float) -> float:
return float(value)
return default_value

def _get_bool(self, var: str, default_value: bool) -> bool:
    """Read *var* as a boolean flag.

    A non-empty value is true only when it equals "true", "1" or "t" (case-insensitive);
    an empty or missing value yields *default_value*.
    """
    raw = self._get_string(var)
    if not raw:
        return default_value
    return raw.lower() in ("true", "1", "t")

def _setup_tmpdir(self, tmpdir: str) -> None:
Path(tmpdir).mkdir(parents=True, exist_ok=True)
tempfile.tempdir = tmpdir

@property
def IMAGE_CROP_PAD(self) -> int:
"""extra image content to add around an identified element region; measured in pixels"""
Expand Down Expand Up @@ -117,5 +139,28 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float:

return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)

@property
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:
    """Whether GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR are used; off by default."""
    enabled = self._get_bool("GLOBAL_WORKING_DIR_ENABLED", default_value=False)
    return enabled

@property
def GLOBAL_WORKING_DIR(self) -> str:
    """Path to the Unstructured cache directory; defaults to `~/.cache/unstructured`."""
    fallback = Path.home() / ".cache/unstructured"
    return self._get_string("GLOBAL_WORKING_DIR", str(fallback))

@property
def GLOBAL_WORKING_PROCESS_DIR(self) -> str:
    """Path to Unstructured cache tempdir. Overrides TMPDIR, TEMP and TMP.

    Defaults to '{GLOBAL_WORKING_DIR}/tmp/{os.getpgid(0)}'; an empty value also falls back
    to that default. While GLOBAL_WORKING_DIR_ENABLED is true, reading this property has a
    side effect: the directory is created and installed as `tempfile.tempdir`.
    """
    fallback = get_tempdir(dir=self.GLOBAL_WORKING_DIR)
    # `or` covers the case where the lookup yields an empty string
    process_dir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", fallback) or fallback
    if self.GLOBAL_WORKING_DIR_ENABLED:
        self._setup_tmpdir(process_dir)
    return process_dir


env_config = ENVConfig()

0 comments on commit 1c8b2b2

Please sign in to comment.