Skip to content

Commit

Permalink
added ingestion
Browse files Browse the repository at this point in the history
  • Loading branch information
emarco177 committed Apr 25, 2023
1 parent 0dcefb7 commit 235edd1
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 2 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ jinja2 = "*"
uvicorn = "*"
streamlit = "*"
streamlit-chat = "*"
tqdm = "*"

[dev-packages]

Expand Down
4 changes: 2 additions & 2 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 36 additions & 0 deletions ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os

from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone

# Name of the Pinecone index that ingest_docs() writes the embedded chunks to.
INDEX_NAME = "langchain-doc-index"

# Configure the Pinecone client once, at import time. Both settings come from
# the environment; a missing variable raises KeyError before anything runs.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT_REGION"],
)


def ingest_docs():
    """Load the local LangChain docs dump, chunk it, and index it in Pinecone.

    Steps:
      1. Load every page found under
         ``langchain-docs/langchain.readthedocs.io/en/latest`` with
         ``ReadTheDocsLoader``.
      2. Split the pages into chunks of at most 2000 characters with a
         200-character overlap.
      3. Rewrite each chunk's ``source`` metadata from a local path to a URL.
      4. Embed the chunks with ``OpenAIEmbeddings`` and upload them to the
         Pinecone index named by ``INDEX_NAME``.

    Progress is reported with ``print``; the function returns ``None``.

    Raises:
        KeyError: if a chunk's metadata has no ``"source"`` entry.
    """
    loader = ReadTheDocsLoader("langchain-docs/langchain.readthedocs.io/en/latest")
    raw_documents = loader.load()
    print(f"loaded {len(raw_documents)} documents")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=200, separators=["\n\n", "\n", " ", ""]
    )
    documents = text_splitter.split_documents(raw_documents)

    for doc in documents:
        # Presumably each source is a path under the loader directory, e.g.
        # "langchain-docs/langchain.readthedocs.io/...". Swapping the root
        # folder name for "https:/" joins with the following "/" to give
        # "https://langchain.readthedocs.io/...".
        # count=1 restricts the rewrite to the first occurrence, so a page
        # whose own path contains "langchain-docs" again is not corrupted.
        local_path = doc.metadata["source"]
        doc.metadata["source"] = local_path.replace("langchain-docs", "https:/", 1)

    embeddings = OpenAIEmbeddings()
    print(f"Going to add {len(documents)} to Pinecone")
    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("****Loading to vectorestore done ***")


# Run the ingestion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    ingest_docs()

0 comments on commit 235edd1

Please sign in to comment.