v0.1.0: CRM/ERP 系统内测版本 - 安全加固完成

- Docker bridge 网络隔离（8000 端口封死）
- Gunicorn 4 Worker 多进程
- Alembic 数据库迁移基线
- 日志轮转 20m×3
- JWT 密钥 + DB 密码 + CORS 收紧
- 3-2-1 备份链路（NAS + R740-B 冷备）
- 连接池 pool_pre_ping + pool_recycle=3600
2026-03-16 07:31:37 +00:00
commit 423baff73b
2578 changed files with 824643 additions and 0 deletions
@@ -0,0 +1,220 @@
+"""
+OCR 服务 — 基于 3090 节点 Qwen3.5-27B (Vision)
+对发票/名片图片做 AI 视觉理解，提取结构化数据。
+"""
+from __future__ import annotations
+
+import base64
+import json
+import re
+import httpx
+from app.core.config import settings
+
+# Instruction text (Chinese) that ocr_image sends with the image when
+# scene == "invoice". It asks the model to reply with a single JSON object
+# holding the listed invoice fields, using null for unreadable fields.
+# NOTE: the braces block is a field template for the model, not valid JSON
+# itself (the numeric placeholders for "amount"/"tax_amount" are unquoted).
+INVOICE_PROMPT = """你是一个专业的发票OCR解析器。请分析图片中的发票/票据，提取以下结构化信息，以 JSON 格式返回：
+
+{
+  "merchant": "开票方/销售方名称",
+  "amount": 金额数字(不带货币符号),
+  "date": "YYYY-MM-DD 格式的开票日期",
+  "invoice_code": "发票代码(如有)",
+  "invoice_number": "发票号码(如有)",
+  "tax_rate": "税率(如有)",
+  "tax_amount": 税额数字(如有),
+  "items": "发票上的商品/服务名称",
+  "buyer": "购买方/抬头(如有)",
+  "remark": "备注信息(如有)"
+}
+
+只输出 JSON，不需要解释。如果某个字段无法识别，设为 null。"""
+
+# Instruction text (Chinese) used when scene == "business_card", both by
+# ocr_image (with an image) and by extract_invoice_from_text (with plain text).
+# It asks for a single JSON object with the contact fields listed below and
+# null for anything that cannot be recognised.
+BUSINESS_CARD_PROMPT = """你是一个名片OCR解析器。请分析图片中的名片，提取以下信息并以 JSON 返回：
+
+{
+  "name": "姓名",
+  "company": "公司名称",
+  "title": "职位",
+  "phone": "电话号码",
+  "email": "邮箱",
+  "address": "地址",
+  "other": "其他信息"
+}
+
+只输出 JSON。无法识别的字段设为 null。"""
+
+
+async def ocr_image(
+    image_base64: str,
+    scene: str = "invoice",
+) -> dict:
+    """
+    调用 3090 Qwen-VL 对图片做视觉理解/OCR。
+
+    Args:
+        image_base64: base64 编码的图片数据
+        scene: "invoice" | "business_card" | "general"
+
+    Returns:
+        {"success": True, "data": {...提取的结构化数据...}}
+    """
+    fallback = {"success": False, "data": {}, "error": "OCR 服务不可用"}
+
+    if not settings.OLLAMA_3090_BASE_URL:
+        return fallback
+
+    prompt = INVOICE_PROMPT if scene == "invoice" else (
+        BUSINESS_CARD_PROMPT if scene == "business_card" else
+        "请详细描述图片中的所有文字内容，以 JSON 格式输出。"
+    )
+
+    url = f"{settings.OLLAMA_3090_BASE_URL}/api/chat"
+    payload = {
+        "model": settings.OLLAMA_3090_MODEL,
+        "messages": [
+            {
+                "role": "user",
+                "content": "/no_think\n" + prompt,
+                "images": [image_base64],  # Ollama vision 格式
+            },
+        ],
+        "stream": False,
+        "options": {
+            "temperature": 0.1,
+            "num_predict": 2000,
+        },
+    }
+
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            resp = await client.post(url, json=payload)
+            if resp.status_code != 200:
+                print(f"[OCR] 3090 返回 {resp.status_code}: {resp.text[:200]}")
+                return {"success": False, "data": {}, "error": f"VL 模型返回 {resp.status_code}"}
+
+            data = resp.json()
+            # Qwen3.5 的 CoT 推理放在 message.thinking，最终结果在 message.content
+            content = data.get("message", {}).get("content", "")
+            thinking = data.get("message", {}).get("thinking", "")
+
+            # 优先从 content 提取 JSON，回退到 thinking
+            for text_source in [content, thinking]:
+                if not text_source:
+                    continue
+                cleaned = re.sub(r'<think>.*?</think>', '', text_source, flags=re.DOTALL).strip()
+                json_match = re.search(r'\{[\s\S]*\}', cleaned)
+                if json_match:
+                    try:
+                        result = json.loads(json_match.group())
+                        print(f"[OCR] 解析成功: {list(result.keys())}")
+                        return {"success": True, "data": result}
+                    except json.JSONDecodeError:
+                        continue
+
+            # 没有提取到 JSON，返回原始文本
+            raw = content or thinking
+            print(f"[OCR] 未能提取 JSON, 内容长度: content={len(content)}, thinking={len(thinking)}")
+            return {"success": True, "data": {"raw_text": raw[:2000]}}
+
+    except httpx.TimeoutException:
+        print("[OCR] 3090 超时（60s）")
+        return {"success": False, "data": {}, "error": "VL 模型响应超时"}
+    except json.JSONDecodeError as e:
+        print(f"[OCR] JSON 解析失败: {e}")
+        return {"success": False, "data": {}, "error": f"JSON 解析失败: {e}"}
+    except Exception as e:
+        print(f"[OCR] 错误: {e}")
+        return {"success": False, "data": {}, "error": str(e)}
+
+
+# Instruction text (Chinese) that extract_invoice_from_text uses when
+# scene == "invoice". It requests the same invoice fields as INVOICE_PROMPT,
+# but tells the model the input is text converted from a PDF (Markdown or
+# plain text) whose layout may be irregular.
+# NOTE: the braces block is a field template for the model, not valid JSON.
+TEXT_INVOICE_PROMPT = """你是一个专业的发票数据提取器。以下是一份发票/票据的文本内容（来自 PDF 转换后的 Markdown 或纯文本）。
+请从中提取以下结构化信息，以 JSON 格式返回：
+
+{
+  "merchant": "开票方/销售方名称",
+  "amount": 金额数字(不带货币符号),
+  "date": "YYYY-MM-DD 格式的开票日期",
+  "invoice_code": "发票代码(如有)",
+  "invoice_number": "发票号码(如有)",
+  "tax_rate": "税率(如有)",
+  "tax_amount": 税额数字(如有),
+  "items": "发票上的商品/服务名称",
+  "buyer": "购买方/抬头(如有)",
+  "remark": "备注信息(如有)"
+}
+
+只输出 JSON，不需要解释。如果某个字段无法识别，设为 null。
+注意：文本可能是从 PDF 转换而来，格式可能不规整，请智能识别。"""
+
+
+async def extract_invoice_from_text(
+    text: str,
+    scene: str = "invoice",
+) -> dict:
+    """
+    用 LLM 从纯文本（MD/TXT）中提取发票结构化数据。
+    不走视觉模型，纯文本理解，更快更准。
+    """
+    fallback = {"success": False, "data": {}, "error": "AI 文本提取服务不可用"}
+
+    if not settings.OLLAMA_3090_BASE_URL:
+        return fallback
+
+    prompt = TEXT_INVOICE_PROMPT if scene == "invoice" else (
+        BUSINESS_CARD_PROMPT if scene == "business_card" else
+        "请从以下文本中提取所有关键信息，以 JSON 格式输出。"
+    )
+
+    # 限制文本长度，避免 token 爆炸
+    truncated = text[:8000] if len(text) > 8000 else text
+
+    url = f"{settings.OLLAMA_3090_BASE_URL}/api/chat"
+    payload = {
+        "model": settings.OLLAMA_3090_MODEL,
+        "messages": [
+            {
+                "role": "user",
+                "content": f"/no_think\n{prompt}\n\n--- 以下是发票文本内容 ---\n\n{truncated}",
+                # 不传 images —— 纯文本模式
+            },
+        ],
+        "stream": False,
+        "options": {
+            "temperature": 0.1,
+            "num_predict": 2000,
+        },
+    }
+
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            resp = await client.post(url, json=payload)
+            if resp.status_code != 200:
+                print(f"[TextExtract] 3090 返回 {resp.status_code}: {resp.text[:200]}")
+                return {"success": False, "data": {}, "error": f"LLM 返回 {resp.status_code}"}
+
+            data = resp.json()
+            content = data.get("message", {}).get("content", "")
+            thinking = data.get("message", {}).get("thinking", "")
+
+            for text_source in [content, thinking]:
+                if not text_source:
+                    continue
+                cleaned = re.sub(r'<think>.*?</think>', '', text_source, flags=re.DOTALL).strip()
+                json_match = re.search(r'\{[\s\S]*\}', cleaned)
+                if json_match:
+                    try:
+                        result = json.loads(json_match.group())
+                        print(f"[TextExtract] AI 提取成功: {list(result.keys())}")
+                        return {"success": True, "data": result}
+                    except json.JSONDecodeError:
+                        continue
+
+            raw = content or thinking
+            print(f"[TextExtract] 未能提取 JSON, 内容: {raw[:200]}")
+            return {"success": True, "data": {"raw_text": raw[:2000]}}
+
+    except httpx.TimeoutException:
+        print("[TextExtract] 3090 超时")
+        return {"success": False, "data": {}, "error": "LLM 响应超时"}
+    except Exception as e:
+        print(f"[TextExtract] 错误: {e}")
+        return {"success": False, "data": {}, "error": str(e)}
+