Initial commit
This commit is contained in:
22
api/core/index/readers/html_parser.py
Normal file
22
api/core/index/readers/html_parser.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from llama_index.readers.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class HTMLParser(BaseParser):
    """File parser that pulls the plain text out of an HTML document."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration; this parser needs none, so it is empty."""
        config: Dict = {}
        return config

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text content of an HTML file.

        Args:
            file: Path of the HTML file to read.
            errors: Accepted for signature compatibility; this implementation
                does not use it.

        Returns:
            The document's text with leading and trailing whitespace removed,
            or an empty string when no text was extracted.
        """
        # The file is opened in binary mode and the raw handle is passed
        # straight to BeautifulSoup, using the "html.parser" backend.
        with open(file, "rb") as handle:
            document = BeautifulSoup(handle, "html.parser")
            extracted = document.get_text()

        if not extracted:
            return ""
        return extracted.strip()
|
Reference in New Issue
Block a user