Source code for langchain_community.document_loaders.parsers.html.bs4

"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""

import logging
from typing import Any, Dict, Iterator, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)

class BS4HTMLParser(BaseBlobParser):
    """Parse HTML files using `Beautiful Soup`.

    Each blob is parsed into a single ``Document`` whose ``page_content`` is
    the text extracted from the HTML and whose metadata carries the blob's
    ``source`` and the page ``title``.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Forwarded to ``BeautifulSoup`` as its ``features``
                keyword argument. Defaults to ``"lxml"``.
            get_text_separator: Passed to ``soup.get_text`` when extracting
                the page text. Defaults to the empty string.
            **kwargs: Additional keyword arguments forwarded unchanged to
                the ``BeautifulSoup`` constructor.

        Raises:
            ImportError: If the ``bs4`` module cannot be imported.
        """
        # Fail fast at construction time rather than on the first parse.
        try:
            import bs4  # noqa:F401
        except ImportError as err:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from err

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: The blob whose bytes are parsed as HTML.

        Yields:
            A single ``Document`` containing the extracted text, with
            ``source`` and ``title`` metadata. ``title`` is the empty string
            when the page has no ``<title>`` element or the element is empty.
        """
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        title_tag = soup.title
        if title_tag is None:
            title = ""
        else:
            title_string = title_tag.string
            if title_string is not None:
                title = str(title_string)
            else:
                # ``.string`` is None for an empty <title> or one with several
                # child nodes; str(None) would store the literal "None".
                title = title_tag.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)