diff --git a/Official_Acoount/LazyRAG_MinerU/rag_mineru.py b/Official_Acoount/LazyRAG_MinerU/rag_mineru.py
new file mode 100644
index 0000000..9d9e54e
--- /dev/null
+++ b/Official_Acoount/LazyRAG_MinerU/rag_mineru.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa: F821
+
+import lazyllm
+from lazyllm import LOG
+from lazyllm import pipeline, parallel, bind, OnlineEmbeddingModule, SentenceSplitter, Document, Retriever, Reranker
+from utils.magic_pdf_reader import MagicPDFReader
+from utils.magic_pdf_transform import MagicPDFTransform
+
+# Note: before running, set the API key environment variable for your LLM service, e.g. export LAZYLLM_DEEPSEEK_API_KEY="" or export LAZYLLM_QWEN_API_KEY=""
+prompt = 'You will play the role of an AI question-answering assistant and complete a conversation task in which you need to provide your answer based on the given context and question. Please note that if the given context cannot answer the question, do not use your prior knowledge but tell the user that the given context cannot answer the question.'
+
+documents = Document(dataset_path="", embed=OnlineEmbeddingModule(), manager=False)
+
+documents.add_reader("**/*.pdf", MagicPDFReader)  # ⚠️ Register magic-pdf as the PDF document parser
+documents.create_node_group(name="magic-pdf", transform=MagicPDFTransform)  # ⚠️ Register the custom node-group transform
+
+with pipeline() as ppl:
+    with parallel().sum as ppl.prl:
+        prl.retriever1 = Retriever(documents, group_name="magic-pdf", similarity="cosine", topk=3)
+        prl.retriever2 = Retriever(documents, group_name="magic-pdf", similarity="bm25_chinese", topk=3)
+    ppl.reranker = Reranker("ModuleReranker", model=OnlineEmbeddingModule(type="rerank"), topk=1, output_format='content', join=True) | bind(query=ppl.input)
+    ppl.formatter = (lambda nodes, query: dict(context_str=nodes, query=query)) | bind(query=ppl.input)
+    ppl.llm = lazyllm.OnlineChatModule(stream=False).prompt(lazyllm.ChatPrompter(prompt, extra_keys=["context_str"]))
+
+
+if __name__ == "__main__":
+    while True:
+        print("✨ Welcome to your smart assistant ✨")
+
+        query = input("\n🚀 Enter your query (type 'exit' to quit): \n> ")
+        if query.lower() == "exit":
+            print("\n👋 Exiting... Thank you for using the assistant!")
+            break
+
+        print(f"\n✅ Received your query: {query}\n")
+
+        answer = ppl(query)
+
+        print("\n" + "=" * 50)
+        print("🚀 ANSWER 🚀")
+        print("=" * 50 + "\n")
+        print(answer)
+        print("\n" + "=" * 50 + "\n")
\ No newline at end of file
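A minimal, non-interactive way to exercise the script above might look like the sketch below. The API key value, the query string, and the assumption that dataset_path in rag_mineru.py has already been pointed at a real PDF directory are placeholders, not part of the patch:

    import os

    # Hypothetical values: substitute your own key (see the export note in the file) and data directory.
    os.environ["LAZYLLM_DEEPSEEK_API_KEY"] = "<your-api-key>"  # or LAZYLLM_QWEN_API_KEY

    from rag_mineru import ppl  # importing builds the Document and pipeline objects

    print(ppl("What does the report say about MinerU?"))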
diff --git a/Official_Acoount/LazyRAG_MinerU/rag_mineru_output.py b/Official_Acoount/LazyRAG_MinerU/rag_mineru_output.py
new file mode 100644
index 0000000..53b660e
--- /dev/null
+++ b/Official_Acoount/LazyRAG_MinerU/rag_mineru_output.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa: F821
+
+import lazyllm
+from lazyllm import LOG
+from lazyllm import pipeline, parallel, bind, OnlineEmbeddingModule, SentenceSplitter, Document, Retriever, Reranker
+from utils.magic_pdf_reader import MagicPDFReader
+from utils.magic_pdf_transform import MagicPDFTransform
+from utils.utils import draw_pdf_bbox
+
+# Note: before running, set the API key environment variable for your LLM service, e.g. export LAZYLLM_DEEPSEEK_API_KEY="" or export LAZYLLM_QWEN_API_KEY=""
+prompt = 'You will play the role of an AI question-answering assistant and complete a conversation task in which you need to provide your answer based on the given context and question. Please note that if the given context cannot answer the question, do not use your prior knowledge but tell the user that the given context cannot answer the question.'
+
+documents = Document(dataset_path="", embed=OnlineEmbeddingModule(), manager=False)
+
+documents.add_reader("**/*.pdf", MagicPDFReader)  # ⚠️ Register magic-pdf as the PDF document parser
+documents.create_node_group(name="magic-pdf", transform=MagicPDFTransform)  # ⚠️ Register the custom node-group transform
+
+
+with pipeline() as ppl:
+    with parallel().sum as ppl.prl:
+        prl.retriever1 = Retriever(documents, group_name="magic-pdf", similarity="cosine", topk=3)
+        prl.retriever2 = Retriever(documents, group_name="magic-pdf", similarity="bm25_chinese", topk=3)
+    ppl.reranker = Reranker("ModuleReranker", model=OnlineEmbeddingModule(type="rerank"), topk=1) | bind(query=ppl.input)
+    ppl.draw_pdf = draw_pdf_bbox | bind(query=ppl.input)  # ⚠️ Pipeline step: box the retrieved content in the source file and save the annotated copy
+    ppl.formatter = (lambda nodes, query: dict(context_str=nodes, query=query)) | bind(query=ppl.input)
+    ppl.llm = lazyllm.OnlineChatModule(stream=False).prompt(lazyllm.ChatPrompter(prompt, extra_keys=["context_str"]))
+
+
+if __name__ == "__main__":
+    while True:
+        print("✨ Welcome to your smart assistant ✨")
+
+        query = input("\n🚀 Enter your query (type 'exit' to quit): \n> ")
+        if query.lower() == "exit":
+            print("\n👋 Exiting... Thank you for using the assistant!")
+            break
+
+        print(f"\n✅ Received your query: {query}\n")
+
+        answer = ppl(query)
+
+        print("\n" + "=" * 50)
+        print("🚀 ANSWER 🚀")
+        print("=" * 50 + "\n")
+        print(answer)
+        print("\n" + "=" * 50 + "\n")
\ No newline at end of file
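The draw_pdf_bbox helper imported from utils.utils is not included in this diff. Assuming it receives the reranked nodes plus the bound query and passes the nodes through to the formatter, a rough PyMuPDF-based sketch of such a helper could look like the following (the body, output filename and box color are illustrative, not the repository's actual implementation):

    import os
    import fitz  # PyMuPDF

    def draw_pdf_bbox(nodes, query):
        for node in nodes:
            meta = node.metadata  # file_path / page / bbox recorded by MagicPDFReader
            doc = fitz.open(meta["file_path"])
            doc[meta["page"]].draw_rect(fitz.Rect(*meta["bbox"]), color=(1, 0, 0), width=1)
            doc.save(os.path.splitext(meta["file_path"])[0] + "_bbox.pdf")
        return nodes  # hand the nodes on to the next pipeline step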
diff --git a/Official_Acoount/LazyRAG_MinerU/utils/magic_pdf_reader.py b/Official_Acoount/LazyRAG_MinerU/utils/magic_pdf_reader.py
new file mode 100644
index 0000000..966176d
--- /dev/null
+++ b/Official_Acoount/LazyRAG_MinerU/utils/magic_pdf_reader.py
@@ -0,0 +1,332 @@
+import os
+import torch
+import copy
+from pathlib import Path
+from bs4 import BeautifulSoup
+from typing import Dict, List, Optional, Iterable
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.config.enums import SupportedPdfParseMethod
+import magic_pdf.model as model_config
+from magic_pdf.libs import config_reader
+from magic_pdf.config.ocr_content_type import BlockType, ContentType
+from magic_pdf.libs.commons import join_path
+from magic_pdf.libs import config_reader
+from magic_pdf.dict2md import ocr_mkcontent
+import unicodedata
+from lazyllm.tools.rag import DocNode
+from lazyllm import LOG
+from utils import get_image_path
+
+# ⚠️ Before use, set up the environment and download the models; see https://github.com/opendatalab/MinerU
+
+# Add patches to magic-pdf
+
+# Configuration; update it for your magic-pdf version (the settings below target 1.0.1)
+def read_config():
+    config = {
+        "bucket_info": {
+            "bucket-name-1": ["ak", "sk", "endpoint"],
+            "bucket-name-2": ["ak", "sk", "endpoint"]
+        },
+        "models-dir": "",  # ⚠️ Download the models and fill in the path
+        "layoutreader-model-dir": "",  # ⚠️ Download the models and fill in the path
+        "layout-config": {
+            "model": "doclayout_yolo"
+        },
+        "formula-config": {
+            "mfd_model": "yolo_v8_mfd",
+            "mfr_model": "unimernet_small",
+            "enable": False
+        },
+        "table-config": {
+            "model": "rapid_table",
+            "enable": True,
+            "max_time": 400
+        },
+        "config_version": "1.0.0"
+    }
+
+    config["device-mode"] = "cuda" if torch.cuda.is_available() else "cpu"
+    return config
+
+
+config_reader.read_config = read_config
+
+
+def parse_line_spans(para_block, page_idx):
+    lines_metas = []
+    page = page_idx
+    if 'lines' in para_block:
+        for line_info in para_block['lines']:
+            if not line_info['spans']:
+                continue
+            line_meta = copy.deepcopy(line_info['spans'][0])
+            line_meta.pop('score', None)
+            if_cross_page = line_meta.pop('cross_page', None)
+            line_meta['page'] = page + 1 if if_cross_page else page
+            lines_metas.append(line_meta)
+    return lines_metas
+
+
+def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
+    para_type = para_block['type']
+    para_content = {}
+
+    lines_metas = parse_line_spans(para_block, page_idx)
+    if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
+        para_content = {
+            'type': 'text',
+            'text': ocr_mkcontent.merge_para_with_text(para_block),
+        }
+    elif para_type == BlockType.Title:
+        para_content = {
+            'type': 'text',
+            'text': ocr_mkcontent.merge_para_with_text(para_block),
+            'text_level': 1,
+        }
+    elif para_type == BlockType.InterlineEquation:
+        para_content = {
+            'type': 'equation',
+            'text': ocr_mkcontent.merge_para_with_text(para_block),
+            'text_format': 'latex',
+        }
+    elif para_type == BlockType.Image:
+        para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
+        image_lines_metas = []
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.ImageBody:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.Image:
+                            if span.get('image_path', ''):
+                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
+            if block['type'] == BlockType.ImageCaption:
+                image_lines_metas.extend(parse_line_spans(block, page_idx))
+                para_content['img_caption'].append(ocr_mkcontent.merge_para_with_text(block))
+            if block['type'] == BlockType.ImageFootnote:
+                image_lines_metas.extend(parse_line_spans(block, page_idx))
+                para_content['img_footnote'].append(ocr_mkcontent.merge_para_with_text(block))
+        para_content['lines'] = image_lines_metas
+    elif para_type == BlockType.Table:
+        table_lines_metas = []
+        para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.TableBody:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.Table:
+
+                            if span.get('latex', ''):
+                                para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
+                            elif span.get('html', ''):
+                                para_content['table_body'] = f"\n\n{span['html']}\n\n"
+
+                            if span.get('image_path', ''):
+                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
+
+            if block['type'] == BlockType.TableCaption:
+                table_lines_metas.extend(parse_line_spans(block, page_idx))
+                para_content['table_caption'].append(ocr_mkcontent.merge_para_with_text(block))
+            if block['type'] == BlockType.TableFootnote:
+                table_lines_metas.extend(parse_line_spans(block, page_idx))
+                para_content['table_footnote'].append(ocr_mkcontent.merge_para_with_text(block))
+        para_content['lines'] = table_lines_metas
+
+    para_content['page_idx'] = page_idx
+    para_content['bbox'] = para_block['bbox']
+    if lines_metas:
+        para_content['lines'] = lines_metas
+
+    if drop_reason is not None:
+        para_content['drop_reason'] = drop_reason
+
+    return para_content
+
+
+ocr_mkcontent.para_to_standard_format_v2 = para_to_standard_format_v2
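+# The patched para_to_standard_format_v2 above extends magic-pdf's original output with
+# per-line span metadata ('lines'), the block bbox and the page index, so that retrieved
+# nodes can later be traced back to their exact region in the source PDF.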
+
+
+class MagicPDFReader:
+    """
+    PDF document parser that extracts text, images and tables and returns structured document nodes.
+    """
+
+    def __init__(self):
+        self.image_save_path = get_image_path()
+
+    def __call__(self, file: Path, split_documents: Optional[bool] = True, **kwargs) -> List[DocNode]:
+        return self._load_data(file, split_documents)
+
+    def _load_data(self, file: Path, split_documents: Optional[bool] = True) -> List[DocNode]:
+        """
+        Parse the PDF and return structured document nodes.
+        """
+        elements = self._parse_pdf_elements(file)
+        docs = []
+
+        if split_documents:
+            for element in elements:
+                metadata = {"file_name": file.name}
+                metadata.update({k: v for k, v in element.items() if k != "text"})
+                metadata.update({'file_path': str(file)})
+                docs.append(DocNode(text=element.get("text", ""), metadata=metadata))
+        else:
+            text_chunks = [el["text"] for el in elements if "text" in el]
+            docs.append(DocNode(text="\n".join(text_chunks), metadata={"file_name": file.name}))
+
+        LOG.info(f"Successfully parsed {file.name}")
+        return docs
+
+    def _parse_pdf_elements(self, pdf_path: Path) -> List[dict]:
+        """
+        Parse the PDF and return a list of document elements (text, tables, images, etc.).
+        """
+        image_dir = os.path.basename(self.image_save_path)
+        os.makedirs(self.image_save_path, exist_ok=True)
+
+        image_writer = FileBasedDataWriter(self.image_save_path)
+        pdf_bytes = FileBasedDataReader("").read(pdf_path)
+
+        ds = PymuDocDataset(pdf_bytes)
+
+        if ds.classify() == SupportedPdfParseMethod.OCR:
+            infer_result = ds.apply(doc_analyze, ocr=True)
+            pipe_result = infer_result.pipe_ocr_mode(image_writer)
+        else:
+            infer_result = ds.apply(doc_analyze, ocr=False)
+            pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+        infer_result.get_infer_res()
+        return self._extract_content_blocks(pipe_result.get_content_list(image_dir))
+
+    def _extract_content_blocks(self, content_list) -> List[dict]:
+        """
+        Walk the parsing result and extract text, table and image content blocks.
+        """
+        blocks = []
+        cur_title = ""
+        cur_level = -1
+        for content in content_list:
+            block = {}
+            block["bbox"] = content["bbox"]
+            block["lines"] = content["lines"] if 'lines' in content else []
+            for line in block['lines']:
+                line['content'] = self._clean_content(line['content'])
+            if content["type"] == "text":
+                content["text"] = self._clean_content(content["text"]).strip()
+                if not content["text"]:
+                    continue
+                if "text_level" in content:
+                    if cur_title and content["text_level"] > cur_level:
+                        content["title"] = cur_title
+                    cur_title = content["text"]
+                    cur_level = content["text_level"]
+                else:
+                    if cur_title:
+                        content["title"] = cur_title
+                block = copy.deepcopy(content)
+                block["page"] = content["page_idx"]
+                del block["page_idx"]
+                blocks.append(block)
+            elif content["type"] == "image":
+                if not content["img_path"]:
+                    continue
+                block["type"] = content["type"]
+                block["page"] = content["page_idx"]
+                block["image_path"] = os.path.basename(content["img_path"])
+                block['img_caption'] = self._clean_content(content['img_caption'])
+                block['img_footnote'] = self._clean_content(content['img_footnote'])
+                if cur_title:
+                    block["title"] = cur_title
+                blocks.append(block)
+            elif content["type"] == "table":
+                block["type"] = content["type"]
+                block["page"] = content["page_idx"]
+                block["text"] = self._html_table_to_markdown(self._clean_content(content["table_body"])) if "table_body" in content else ""
+                if cur_title:
+                    block["title"] = cur_title
+                block['table_caption'] = self._clean_content(content['table_caption'])
+                block['table_footnote'] = self._clean_content(content['table_footnote'])
+                blocks.append(block)
+        return blocks
+
+    def _clean_content(self, content) -> str:
+        """
+        Clean text content: repair encoding issues and apply Unicode NFKC normalization.
+        """
+        if isinstance(content, str):
+            content = content.encode("utf-8", "replace").decode("utf-8")
+            return unicodedata.normalize("NFKC", content)
+        if isinstance(content, list):
+            return [self._clean_content(t) for t in content]
+        return content
+
+    def _html_table_to_markdown(self, html_table) -> str:
+        """
+        Convert an HTML table into Markdown format.
+        """
+        try:
+            soup = BeautifulSoup(html_table.strip(), 'html.parser')
+            table = soup.find('table')
+            if not table:
+                raise ValueError("No