Source code for langchain_community.vectorstores.inmemory

import json
import uuid
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.load import dumpd, load
from langchain_core.vectorstores import VectorStore

from langchain_community.utils.math import cosine_similarity
from langchain_community.vectorstores.utils import maximal_marginal_relevance


class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.

    Every entry of ``store`` maps a document id to a dict with the keys
    ``"id"``, ``"vector"``, ``"text"`` and ``"metadata"``.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Create an empty store that embeds texts with ``embedding``."""
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function this store was created with."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the entries with the given ids.

        Unknown ids are ignored; passing ``None`` or an empty sequence is a
        no-op.

        Args:
            ids: ids of the entries to remove.
            **kwargs: accepted for interface compatibility; unused.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(
        self, ids: Optional[Sequence[str]] = None, **kwargs: Any
    ) -> None:
        """Async variant of :meth:`delete`; runs the synchronous code inline."""
        self.delete(ids)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and add them to the store.

        An entry whose id already exists in the store is overwritten.

        Args:
            texts: texts to embed and store. Any iterable is accepted,
                including one-shot iterators such as generators.
            metadatas: optional metadata dicts, one per text. When omitted
                (or empty) every entry gets an empty dict.
            ids: optional ids, one per text. When omitted (or empty) a random
                UUID4 string is generated for each text.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The ids of the stored entries, in the order of ``texts``.

        Raises:
            ValueError: if ``ids`` or ``metadatas`` is given but contains
                fewer items than ``texts``.
        """
        # Materialise once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the entries below.
        text_list = list(texts)

        # Validate before embedding or mutating, so a bad call neither pays
        # for the embeddings nor leaves the store partially updated.
        if ids and len(ids) < len(text_list):
            raise ValueError(
                f"Got {len(ids)} ids for {len(text_list)} texts; "
                "expected at least one id per text."
            )
        if metadatas and len(metadatas) < len(text_list):
            raise ValueError(
                f"Got {len(metadatas)} metadatas for {len(text_list)} texts; "
                "expected at least one metadata dict per text."
            )

        vectors = self.embedding.embed_documents(text_list)
        ids_ = []
        for i, text in enumerate(text_list):
            doc_id = ids[i] if ids else str(uuid.uuid4())
            ids_.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vectors[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
            }
        return ids_

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async variant of :meth:`add_texts`; runs the synchronous code inline.

        ``ids`` may be supplied as a keyword argument; it is forwarded through
        ``**kwargs``.
        """
        return self.add_texts(texts, metadatas, **kwargs)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float, List[float]]]:
        """Return the ``k`` stored entries most similar to ``embedding``.

        Args:
            embedding: query vector.
            k: maximum number of hits to return.
            filter: optional predicate; only documents for which it returns a
                truthy value are considered.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            ``(document, cosine_similarity, stored_vector)`` tuples sorted by
            descending similarity.
        """
        # Apply the filter first so similarity is only computed for
        # documents that can actually be returned.
        candidates = []
        for entry in self.store.values():
            document = Document(
                page_content=entry["text"], metadata=entry["metadata"]
            )
            if filter is None or filter(document):
                candidates.append((document, entry["vector"]))

        if not candidates:
            return []

        # One batched call scores every candidate against the query, instead
        # of one call per stored document.
        similarities = np.asarray(
            cosine_similarity(
                [embedding], [vector for _, vector in candidates]
            )
        ).reshape(-1)

        result = [
            (document, float(similarity), vector)
            for (document, vector), similarity in zip(candidates, similarities)
        ]
        result.sort(key=lambda hit: hit[1], reverse=True)
        return result[:k]

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return up to ``k`` ``(document, cosine_similarity)`` pairs.

        Args:
            embedding: query vector.
            k: maximum number of hits to return.
            filter: optional predicate selecting the documents to consider.
            **kwargs: forwarded to the internal search.

        Returns:
            Pairs sorted by descending similarity.
        """
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``k`` scored matches.

        Args:
            query: text to embed with the store's embedding function.
            k: maximum number of hits to return.
            **kwargs: forwarded to
                :meth:`similarity_search_with_score_by_vector` (e.g. ``filter``).

        Returns:
            ``(document, cosine_similarity)`` pairs, most similar first.
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return docs

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async variant of :meth:`similarity_search_with_score` (runs inline)."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to ``embedding``.

        Args:
            embedding: query vector.
            k: maximum number of documents to return.
            **kwargs: forwarded to
                :meth:`similarity_search_with_score_by_vector` (e.g. ``filter``).

        Returns:
            Documents ordered by descending cosine similarity.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of :meth:`similarity_search_by_vector` (runs inline)."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents by maximal marginal relevance to ``embedding``.

        The ``fetch_k`` most similar entries are fetched first and then
        re-ranked with ``maximal_marginal_relevance``.

        Args:
            embedding: query vector.
            k: number of documents to return.
            fetch_k: number of nearest entries handed to the MMR step.
            lambda_mult: passed through to ``maximal_marginal_relevance``.
            **kwargs: forwarded to the internal similarity search
                (e.g. ``filter``).

        Returns:
            The documents chosen by the MMR step, in selection order.
        """
        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            **kwargs,
        )
        if not prefetch_hits:
            # Nothing stored (or everything filtered out): nothing to rank.
            return []

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and select documents by maximal marginal relevance.

        Args:
            query: text to embed with the store's embedding function.
            k: number of documents to return.
            fetch_k: number of nearest entries handed to the MMR step.
            lambda_mult: passed through to ``maximal_marginal_relevance``.
            **kwargs: forwarded to
                :meth:`max_marginal_relevance_search_by_vector`.

        Returns:
            The documents chosen by the MMR step, in selection order.
        """
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k=k,
            fetch_k=fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Build a store from ``texts``.

        Args:
            texts: texts to embed and store.
            embedding: embedding function to use.
            metadatas: optional metadata dicts, one per text.
            **kwargs: forwarded to :meth:`add_texts` (e.g. ``ids``).

        Returns:
            A new store containing the given texts.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async variant of :meth:`from_texts`; runs the synchronous code inline."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
        """Create a store from a JSON file such as one written by :meth:`dump`.

        Args:
            path: location of the JSON file.
            embedding: embedding function for the new store.
            **kwargs: forwarded to the class constructor.

        Returns:
            A new store whose ``store`` dict is the deserialized file content.
        """
        _path: Path = Path(path)
        # Explicit encoding so reading does not depend on the platform default.
        with _path.open("r", encoding="utf-8") as f:
            # Inside this method ``load`` is the module-level function imported
            # from langchain_core.load, not this classmethod.
            # NOTE(review): that function revives serialized objects from the
            # file content, so only load files from trusted sources.
            store = load(json.load(f))
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Serialize ``store`` to a JSON file, creating parent directories.

        Args:
            path: destination file; overwritten if it already exists.
        """
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        # Explicit encoding so writing does not depend on the platform default.
        with _path.open("w", encoding="utf-8") as f:
            json.dump(dumpd(self.store), f, indent=2)