v0.1.0: CRM/ERP 系统内测版本 - 安全加固完成
- Docker bridge 网络隔离(8000 端口封死)
- Gunicorn 4 Worker 多进程
- Alembic 数据库迁移基线
- 日志轮转 20m×3
- JWT 密钥 + DB 密码 + CORS 收紧
- 3-2-1 备份链路(NAS + R740-B 冷备)
- 连接池 pool_pre_ping + pool_recycle=3600
This commit is contained in:
@@ -0,0 +1,220 @@
|
||||
"""
OCR service backed by Qwen3.5-27B (Vision) on the 3090 node.

Applies AI visual understanding to invoice / business-card images and
extracts structured data from them.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
import httpx
|
||||
from app.core.config import settings
|
||||
|
||||
# Prompt sent with an invoice/receipt image: asks the vision model to return
# a single JSON object with the fields listed below (null when unreadable).
INVOICE_PROMPT = """你是一个专业的发票OCR解析器。请分析图片中的发票/票据,提取以下结构化信息,以 JSON 格式返回:

{
"merchant": "开票方/销售方名称",
"amount": 金额数字(不带货币符号),
"date": "YYYY-MM-DD 格式的开票日期",
"invoice_code": "发票代码(如有)",
"invoice_number": "发票号码(如有)",
"tax_rate": "税率(如有)",
"tax_amount": 税额数字(如有),
"items": "发票上的商品/服务名称",
"buyer": "购买方/抬头(如有)",
"remark": "备注信息(如有)"
}

只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。"""
|
||||
|
||||
# Prompt sent for business-card input: asks the model to return a single JSON
# object with the contact fields listed below (null when unreadable).
BUSINESS_CARD_PROMPT = """你是一个名片OCR解析器。请分析图片中的名片,提取以下信息并以 JSON 返回:

{
"name": "姓名",
"company": "公司名称",
"title": "职位",
"phone": "电话号码",
"email": "邮箱",
"address": "地址",
"other": "其他信息"
}

只输出 JSON。无法识别的字段设为 null。"""
|
||||
|
||||
|
||||
async def ocr_image(
    image_base64: str,
    scene: str = "invoice",
) -> dict:
    """Run vision OCR on an image through the Ollama chat endpoint.

    The image is posted with a scene-specific prompt to
    ``{settings.OLLAMA_3090_BASE_URL}/api/chat`` (non-streaming) and the
    first JSON object found in the reply is returned as structured data.

    Args:
        image_base64: Base64-encoded image data; forwarded unchanged in the
            message's ``images`` list.
        scene: ``"invoice"`` or ``"business_card"`` select the matching
            prompt; any other value uses a generic "describe all text" prompt.

    Returns:
        ``{"success": True, "data": {...}}`` with the parsed JSON object, or
        ``{"success": True, "data": {"raw_text": ...}}`` (reply truncated to
        2000 characters) when no JSON object could be parsed.  On failure,
        ``{"success": False, "data": {}, "error": <message>}``; exceptions
        raised while calling the model or reading its reply are converted
        into this failure shape rather than propagated.
    """
    fallback = {"success": False, "data": {}, "error": "OCR 服务不可用"}

    if not settings.OLLAMA_3090_BASE_URL:
        return fallback

    # Nothing to recognise: fail fast instead of spending a model call.
    if not image_base64:
        return {"success": False, "data": {}, "error": "图片数据为空"}

    if scene == "invoice":
        prompt = INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请详细描述图片中的所有文字内容,以 JSON 格式输出。"

    # Single source of truth so the timeout log below cannot drift from the
    # value actually handed to the HTTP client.
    timeout_seconds = 120.0

    url = f"{settings.OLLAMA_3090_BASE_URL}/api/chat"
    payload = {
        "model": settings.OLLAMA_3090_MODEL,
        "messages": [
            {
                "role": "user",
                "content": "/no_think\n" + prompt,
                "images": [image_base64],  # Ollama vision format
            },
        ],
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2000,
        },
    }

    try:
        async with httpx.AsyncClient(timeout=timeout_seconds) as client:
            resp = await client.post(url, json=payload)

        if resp.status_code != 200:
            print(f"[OCR] 3090 返回 {resp.status_code}: {resp.text[:200]}")
            return {"success": False, "data": {}, "error": f"VL 模型返回 {resp.status_code}"}

        data = resp.json()
        # The final answer is read from message.content and the model's
        # reasoning from message.thinking.  ``or`` also maps explicit JSON
        # nulls to empty values so the len()/slicing below cannot hit None.
        message = data.get("message") or {}
        content = message.get("content") or ""
        thinking = message.get("thinking") or ""

        # Prefer JSON found in content; fall back to thinking.
        decoder = json.JSONDecoder()
        for text_source in (content, thinking):
            if not text_source:
                continue
            cleaned = re.sub(r'<think>.*?</think>', '', text_source, flags=re.DOTALL).strip()
            start = cleaned.find("{")
            if start == -1:
                continue
            try:
                # raw_decode stops at the end of the first complete object,
                # so a stray "}" in trailing prose no longer breaks parsing
                # the way a greedy first-"{"-to-last-"}" match would.
                result, _ = decoder.raw_decode(cleaned, start)
            except json.JSONDecodeError:
                continue
            print(f"[OCR] 解析成功: {list(result.keys())}")
            return {"success": True, "data": result}

        # No JSON could be extracted: hand back the raw reply text instead.
        raw = content or thinking
        print(f"[OCR] 未能提取 JSON, 内容长度: content={len(content)}, thinking={len(thinking)}")
        return {"success": True, "data": {"raw_text": raw[:2000]}}

    except httpx.TimeoutException:
        print(f"[OCR] 3090 超时({timeout_seconds:.0f}s)")
        return {"success": False, "data": {}, "error": "VL 模型响应超时"}
    except json.JSONDecodeError as e:
        # Raised by resp.json() when the HTTP body itself is not valid JSON.
        print(f"[OCR] JSON 解析失败: {e}")
        return {"success": False, "data": {}, "error": f"JSON 解析失败: {e}"}
    except Exception as e:
        # Service boundary: report any other failure in the result dict.
        print(f"[OCR] 错误: {e}")
        return {"success": False, "data": {}, "error": str(e)}
|
||||
|
||||
|
||||
# Prompt used when the invoice is supplied as text (PDF converted to
# Markdown / plain text) rather than as an image; same JSON fields as the
# image prompt, plus a note that the text layout may be irregular.
TEXT_INVOICE_PROMPT = """你是一个专业的发票数据提取器。以下是一份发票/票据的文本内容(来自 PDF 转换后的 Markdown 或纯文本)。
请从中提取以下结构化信息,以 JSON 格式返回:

{
"merchant": "开票方/销售方名称",
"amount": 金额数字(不带货币符号),
"date": "YYYY-MM-DD 格式的开票日期",
"invoice_code": "发票代码(如有)",
"invoice_number": "发票号码(如有)",
"tax_rate": "税率(如有)",
"tax_amount": 税额数字(如有),
"items": "发票上的商品/服务名称",
"buyer": "购买方/抬头(如有)",
"remark": "备注信息(如有)"
}

只输出 JSON,不需要解释。如果某个字段无法识别,设为 null。
注意:文本可能是从 PDF 转换而来,格式可能不规整,请智能识别。"""
|
||||
|
||||
|
||||
async def extract_invoice_from_text(
    text: str,
    scene: str = "invoice",
) -> dict:
    """Extract structured data from plain text (MD/TXT) with the LLM.

    The text is sent without any image to
    ``{settings.OLLAMA_3090_BASE_URL}/api/chat`` (non-streaming) together
    with a scene-specific prompt, and the first JSON object found in the
    reply is returned.

    Args:
        text: Source text; only the first 8000 characters are sent, to bound
            the prompt size.
        scene: ``"invoice"`` or ``"business_card"`` select the matching
            prompt; any other value uses a generic "extract key information"
            prompt.

    Returns:
        ``{"success": True, "data": {...}}`` with the parsed JSON object, or
        ``{"success": True, "data": {"raw_text": ...}}`` (reply truncated to
        2000 characters) when no JSON object could be parsed.  On failure
        (service not configured, empty text, non-200 status, timeout, or any
        other error while calling the model),
        ``{"success": False, "data": {}, "error": <message>}``.
    """
    fallback = {"success": False, "data": {}, "error": "AI 文本提取服务不可用"}

    if not settings.OLLAMA_3090_BASE_URL:
        return fallback

    # Guard before any slicing: None would raise outside the try block, and
    # blank text cannot yield any fields, so skip the model call entirely.
    if not text or not text.strip():
        return {"success": False, "data": {}, "error": "文本内容为空"}

    if scene == "invoice":
        prompt = TEXT_INVOICE_PROMPT
    elif scene == "business_card":
        prompt = BUSINESS_CARD_PROMPT
    else:
        prompt = "请从以下文本中提取所有关键信息,以 JSON 格式输出。"

    # Cap the text length to avoid a token blow-up (slicing is a no-op for
    # shorter input).
    truncated = text[:8000]

    url = f"{settings.OLLAMA_3090_BASE_URL}/api/chat"
    payload = {
        "model": settings.OLLAMA_3090_MODEL,
        "messages": [
            {
                "role": "user",
                "content": f"/no_think\n{prompt}\n\n--- 以下是发票文本内容 ---\n\n{truncated}",
                # No "images" key: text-only request.
            },
        ],
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2000,
        },
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(url, json=payload)

        if resp.status_code != 200:
            print(f"[TextExtract] 3090 返回 {resp.status_code}: {resp.text[:200]}")
            return {"success": False, "data": {}, "error": f"LLM 返回 {resp.status_code}"}

        data = resp.json()
        # ``or`` maps explicit JSON nulls to empty values so the slicing
        # below cannot hit None.
        message = data.get("message") or {}
        content = message.get("content") or ""
        thinking = message.get("thinking") or ""

        # Prefer JSON found in content; fall back to thinking.
        decoder = json.JSONDecoder()
        for text_source in (content, thinking):
            if not text_source:
                continue
            cleaned = re.sub(r'<think>.*?</think>', '', text_source, flags=re.DOTALL).strip()
            start = cleaned.find("{")
            if start == -1:
                continue
            try:
                # raw_decode stops at the end of the first complete object,
                # so a stray "}" in trailing prose no longer breaks parsing
                # the way a greedy first-"{"-to-last-"}" match would.
                result, _ = decoder.raw_decode(cleaned, start)
            except json.JSONDecodeError:
                continue
            print(f"[TextExtract] AI 提取成功: {list(result.keys())}")
            return {"success": True, "data": result}

        # No JSON could be extracted: hand back the raw reply text instead.
        raw = content or thinking
        print(f"[TextExtract] 未能提取 JSON, 内容: {raw[:200]}")
        return {"success": True, "data": {"raw_text": raw[:2000]}}

    except httpx.TimeoutException:
        print("[TextExtract] 3090 超时")
        return {"success": False, "data": {}, "error": "LLM 响应超时"}
    except Exception as e:
        # Service boundary: report any other failure in the result dict.
        print(f"[TextExtract] 错误: {e}")
        return {"success": False, "data": {}, "error": str(e)}
|
||||
|
||||
Reference in New Issue
Block a user