"""OCR service.

Sends invoice / business-card images (or already-extracted text) to the
Ollama chat endpoint configured through ``settings.OLLAMA_3090_BASE_URL`` and
``settings.OLLAMA_3090_MODEL`` and turns the model reply into structured data.
"""

from __future__ import annotations

import base64
import json
import re

import httpx

from app.core.config import settings

# NOTE: the prompt texts below are kept character-for-character; they are only
# split across adjacent string literals to keep source lines readable.

# JSON skeleton shared by the image-based and the text-based invoice prompts.
_INVOICE_JSON_TEMPLATE = (
    '{ "merchant": "开票方/销售方名称", '
    '"amount": 金额数字(不带货币符号), '
    '"date": "YYYY-MM-DD 格式的开票日期", '
    '"invoice_code": "发票代码(如有)", '
    '"invoice_number": "发票号码(如有)", '
    '"tax_rate": "税率(如有)", '
    '"tax_amount": 税额数字(如有), '
    '"items": "发票上的商品/服务名称", '
    '"buyer": "购买方/抬头(如有)", '
    '"remark": "备注信息(如有)" }'
)

INVOICE_PROMPT = (
    "你是一个专业的发票OCR解析器。请分析图片中的发票/票据,提取以下结构化信息,以 JSON 格式返回: "
    + _INVOICE_JSON_TEMPLATE
    + " 只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。"
)

BUSINESS_CARD_PROMPT = (
    "你是一个名片OCR解析器。请分析图片中的名片,提取以下信息并以 JSON 返回: "
    '{ "name": "姓名", "company": "公司名称", "title": "职位", '
    '"phone": "电话号码", "email": "邮箱", "address": "地址", "other": "其他信息" } '
    "只输出 JSON。无法识别的字段设为 null。"
)

TEXT_INVOICE_PROMPT = (
    "你是一个专业的发票数据提取器。以下是一份发票/票据的文本内容(来自 PDF 转换后的 Markdown 或纯文本)。 "
    "请从中提取以下结构化信息,以 JSON 格式返回: "
    + _INVOICE_JSON_TEMPLATE
    + " 只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。 "
    "注意:文本可能是从 PDF 转换而来,格式可能不规整,请智能识别。"
)

# Timeout (seconds) applied to every request sent to the model endpoint.
_REQUEST_TIMEOUT_SECONDS = 120.0
# Maximum number of input characters forwarded by extract_invoice_from_text.
_MAX_TEXT_CHARS = 8000
# Maximum number of reply characters returned as "raw_text" when no JSON is found.
_MAX_RAW_TEXT_CHARS = 2000

# The previous pattern was r'.*?', which can only match empty strings, so the
# substitution never removed anything.
# NOTE(review): the <think>...</think> delimiters are presumed to have been lost
# from that pattern — confirm this is the markup the deployed model emits.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
_JSON_DECODER = json.JSONDecoder()


def _extract_json_object(content: str) -> dict | None:
    """Return the first JSON object embedded in *content*, or ``None``.

    Reasoning blocks are stripped first. Every ``{`` is then tried in order as
    the start of an object, so explanatory text (or stray braces) before or
    after the JSON no longer prevents parsing.
    """
    cleaned = _THINK_BLOCK_RE.sub("", content).strip()
    start = cleaned.find("{")
    while start != -1:
        try:
            parsed, _end = _JSON_DECODER.raw_decode(cleaned[start:])
        except json.JSONDecodeError:
            start = cleaned.find("{", start + 1)
        else:
            # A successful decode that starts at "{" is always a JSON object.
            return parsed
    return None


def _build_chat_payload(message: dict) -> dict:
    """Wrap a single user *message* in the request body sent to ``/api/chat``."""
    return {
        "model": settings.OLLAMA_3090_MODEL,
        "messages": [message],
        "stream": False,
        # Disable thinking mode: steadier output, avoids runaway loops, 2-5x faster.
        "think": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2000,
        },
    }


async def _post_chat(payload: dict) -> httpx.Response:
    """POST *payload* to the configured chat endpoint and return the response."""
    # Tolerate a trailing slash in the configured base URL.
    base_url = str(settings.OLLAMA_3090_BASE_URL).rstrip("/")
    async with httpx.AsyncClient(timeout=_REQUEST_TIMEOUT_SECONDS) as client:
        return await client.post(f"{base_url}/api/chat", json=payload)


async def ocr_image(
    image_base64: str,
    scene: str = "invoice",
) -> dict:
    """Run vision OCR on an image and return the extracted data.

    Args:
        image_base64: Base64-encoded image data.
        scene: ``"invoice"`` or ``"business_card"`` select a dedicated prompt;
            any other value uses a generic "describe all text" prompt.

    Returns:
        ``{"success": True, "data": {...}}`` where ``data`` is the parsed JSON
        object, or ``{"raw_text": ...}`` (reply truncated to 2000 characters)
        when no JSON object could be parsed. On failure:
        ``{"success": False, "data": {}, "error": "<message>"}``.
        Exceptions are caught and reported through ``error``.
    """
    if not settings.OLLAMA_3090_BASE_URL:
        return {"success": False, "data": {}, "error": "OCR 服务不可用"}

    if scene == "invoice":
        prompt = INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请详细描述图片中的所有文字内容,以 JSON 格式输出。"

    payload = _build_chat_payload(
        {
            "role": "user",
            "content": prompt,
            "images": [image_base64],  # Ollama vision format
        }
    )

    try:
        resp = await _post_chat(payload)

        if resp.status_code != 200:
            detail = resp.text[:200]
            print(f"[OCR] 3090 返回 {resp.status_code}: {detail}")
            if "model runner" in detail:
                return {"success": False, "data": {}, "error": "AI OCR 模型进程崩溃,请联系管理员重启 Ollama 服务"}
            return {"success": False, "data": {}, "error": f"AI OCR 服务异常 (HTTP {resp.status_code}),请稍后重试"}

        # With thinking mode off the answer is read straight from message.content.
        content = resp.json().get("message", {}).get("content", "")
        result = _extract_json_object(content) if content else None
        if result is not None:
            print(f"[OCR] 解析成功: {list(result.keys())}")
            return {"success": True, "data": result}

        print(f"[OCR] 未能提取 JSON, content 长度: {len(content)}")
        return {"success": True, "data": {"raw_text": content[:_MAX_RAW_TEXT_CHARS]}}

    except httpx.TimeoutException:
        print("[OCR] 3090 超时(120s)")
        return {"success": False, "data": {}, "error": "AI OCR 响应超时(120s),模型可能负载过高,请稍后重试"}
    except json.JSONDecodeError as e:
        # Raised by resp.json() when the HTTP body itself is not valid JSON.
        print(f"[OCR] JSON 解析失败: {e}")
        return {"success": False, "data": {}, "error": f"JSON 解析失败: {e}"}
    except Exception as e:
        # Service boundary: report any other failure instead of raising.
        print(f"[OCR] 错误: {e}")
        return {"success": False, "data": {}, "error": str(e)}


async def extract_invoice_from_text(
    text: str,
    scene: str = "invoice",
) -> dict:
    """Extract structured data from plain text (Markdown / TXT) with the LLM.

    No image is sent; only the first 8000 characters of *text* are forwarded
    to keep the request size bounded.

    Args:
        text: Text content of the document.
        scene: ``"invoice"`` or ``"business_card"`` select a dedicated prompt;
            any other value uses a generic "extract key information" prompt.

    Returns:
        Same shape as :func:`ocr_image`: ``{"success": True, "data": {...}}``
        (parsed JSON object, or ``{"raw_text": ...}`` when none was found), or
        ``{"success": False, "data": {}, "error": "<message>"}`` on failure.
    """
    if not settings.OLLAMA_3090_BASE_URL:
        return {"success": False, "data": {}, "error": "AI 文本提取服务不可用"}

    if scene == "invoice":
        prompt = TEXT_INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请从以下文本中提取所有关键信息,以 JSON 格式输出。"

    # Cap the text length to avoid blowing up the token count.
    truncated = text[:_MAX_TEXT_CHARS]

    payload = _build_chat_payload(
        {
            "role": "user",
            "content": f"{prompt}\n\n--- 以下是发票文本内容 ---\n\n{truncated}",
        }
    )

    try:
        resp = await _post_chat(payload)

        if resp.status_code != 200:
            print(f"[TextExtract] 3090 返回 {resp.status_code}: {resp.text[:200]}")
            return {"success": False, "data": {}, "error": f"LLM 返回 {resp.status_code}"}

        content = resp.json().get("message", {}).get("content", "")
        result = _extract_json_object(content) if content else None
        if result is not None:
            print(f"[TextExtract] AI 提取成功: {list(result.keys())}")
            return {"success": True, "data": result}

        print(f"[TextExtract] 未能提取 JSON, content: {content[:200]}")
        return {"success": True, "data": {"raw_text": content[:_MAX_RAW_TEXT_CHARS]}}

    except httpx.TimeoutException:
        print("[TextExtract] 3090 超时")
        return {"success": False, "data": {}, "error": "LLM 响应超时"}
    except Exception as e:
        # Service boundary: report any other failure instead of raising.
        print(f"[TextExtract] 错误: {e}")
        return {"success": False, "data": {}, "error": str(e)}