Source code for langchain_community.document_transformers.nuclia_text_transform

import asyncio
import json
import uuid
from typing import Any, Sequence

from langchain_core.documents import BaseDocumentTransformer, Document

from langchain_community.tools.nuclia.tool import NucliaUnderstandingAPI

class NucliaTextTransformer(BaseDocumentTransformer):
    """Document transformer backed by the Nuclia Understanding API.

    According to the upstream description, the Nuclia Understanding API
    splits text into paragraphs and sentences, identifies entities,
    provides a summary of the text and generates embeddings for all
    sentences.

    Only the asynchronous entry point is implemented; the synchronous
    ``transform_documents`` always raises ``NotImplementedError``.
    """

    def __init__(self, nua: NucliaUnderstandingAPI):
        """Store the Nuclia client used to process documents.

        Args:
            nua: Object whose ``arun`` coroutine is awaited once per document.
        """
        self.nua = nua

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Synchronous transformation is not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Push every document to Nuclia and attach the results in place.

        One ``push`` request is issued per document, each tagged with a
        freshly generated UUID, and all requests are awaited together.
        The first entries of the response's ``file_extracted_data`` and
        ``field_metadata`` lists are stored under
        ``doc.metadata["nuclia"]``.

        Args:
            documents: Documents whose ``page_content`` is sent to Nuclia.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The same ``documents`` sequence that was passed in, with each
            document's metadata updated.
        """
        pending = []
        for document in documents:
            request = {
                "action": "push",
                "id": str(uuid.uuid4()),
                "text": document.page_content,
                "path": None,
            }
            pending.append(self.nua.arun(request))

        # gather() returns results in submission order, so they can be
        # zipped back against the input documents.
        responses = await asyncio.gather(*pending)

        for document, raw_response in zip(documents, responses):
            # Each response is decoded with json.loads, so it is expected
            # to be a JSON-encoded payload.
            payload = json.loads(raw_response)
            document.metadata["nuclia"] = {
                "file": payload["file_extracted_data"][0],
                "metadata": payload["field_metadata"][0],
            }
        return documents