Skip to content

Commit

Permalink
Disable PyTorch Compile Multiprocessing (#34)
Browse files Browse the repository at this point in the history
* Move tokenizer import

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Reduce inductor threads

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Change env int to string

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Change location of env var

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Add comment linking issue

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

---------

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
  • Loading branch information
ryantwolf authored Apr 22, 2024
1 parent a8f8768 commit 0bbc77e
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
3 changes: 2 additions & 1 deletion nemo_curator/filters/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import numpy as np
from bs4 import BeautifulSoup
from comment_parser import comment_parser
from nemo.collections.common.tokenizers import SentencePieceTokenizer

from nemo_curator.filters.doc_filter import DocumentFilter, import_filter
from nemo_curator.utils.constants import regex_alpha, regex_alphanum
Expand Down Expand Up @@ -104,6 +103,8 @@ def keep_document(self, score):
class TokenizerFertilityFilter(DocumentFilter):

def __init__(self, path_to_tokenizer=None, min_char_to_token_ratio=2.5):
from nemo.collections.common.tokenizers import SentencePieceTokenizer

if path_to_tokenizer is None:
raise ValueError(
"Must provide a valid path to a SentencePiece " "tokenizer"
Expand Down
7 changes: 7 additions & 0 deletions nemo_curator/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

# Disables multiprocessing in torch.compile calls.
# Without this, Dask's multiprocessing combined with PyTorch's
# gives errors like "daemonic processes are not allowed to have children"
# See https://github.com/NVIDIA/NeMo-Curator/issues/31
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

from .add_id import AddId
from .exact_dedup import ExactDuplicates
Expand Down

0 comments on commit 0bbc77e

Please sign in to comment.