Ubuntu 部署 MinerU 本地解析

1.1 完整树形图文件目录

deploy/
└── roof-mineru/                  # MinerU OCR 服务部署根目录
    ├── roof_mineru/              # Python 服务包
    │   ├── __init__.py           # 包初始化
    │   ├── app.py                # FastAPI 入口,提供 /parse /task /result /health
    │   ├── config.py             # 环境变量、目录、日志、默认参数
    │   ├── file_utils.py         # 文件名清理、Markdown 读取、过期目录清理
    │   ├── mineru_runner.py      # MinerU CLI 调用与输出解析
    │   ├── schemas.py            # API 响应与任务模型
    │   └── task_store.py         # 任务内存状态与 result.json 持久化
    ├── docker-compose.yml        # Docker Compose 配置(默认 GPU 版)
    ├── Dockerfile.cpu            # CPU/通用镜像方案
    ├── Dockerfile.gpu            # GPU/CUDA 11.8 镜像方案
    ├── requirements.txt          # Python 依赖列表
    ├── start.sh                  # 容器启动脚本
    ├── uploads/                  # 上传文件临时目录(挂载)
    ├── output/                   # MinerU 输出目录(挂载)
    ├── logs/                     # 服务日志目录(挂载)
    ├── models/                   # 模型与框架缓存持久化目录(挂载)
    └── cache/                    # /root/.cache 持久化目录(挂载)

1.2 文件说明

| 文件 | 说明 |
| --- | --- |
| roof_mineru/app.py | FastAPI 入口文件,定义 REST API 接口 |
| roof_mineru/config.py | 配置类,读取环境变量,管理目录和参数 |
| roof_mineru/mineru_runner.py | 调用 MinerU CLI 命令执行文档解析 |
| roof_mineru/task_store.py | 任务状态管理,支持内存和文件持久化 |
| docker-compose.yml | Docker Compose 部署配置,默认使用 GPU 镜像 |
| Dockerfile.gpu | GPU 版镜像构建文件,基于 CUDA 11.8 |
| Dockerfile.cpu | CPU 版镜像构建文件,基于 Python 3.11 |
| start.sh | 容器启动脚本,包含 GPU 检测和服务启动 |
| requirements.txt | Python 依赖:fastapi、uvicorn、mineru、paddleocr |

1.3 文件内容

requirements.txt

# Python dependencies for the roof-mineru service.
# One requirement per line: a '#' starts a comment that runs to end of line,
# so comments must never share a line with a later requirement.

# FastAPI application server and request handling.
fastapi==0.115.6
uvicorn[standard]==0.34.0
python-multipart==0.0.20
pydantic==2.10.5

# Official MinerU package; includes table recognition.
mineru[pipeline]==2.1.11

# OCR for table images. Intended to be compatible with the
# paddlepaddle-gpu 2.6.2 wheel that Dockerfile.gpu installs.
paddleocr==2.9.1

docker-compose.yml

# Compose definition for the roof-mineru OCR service (GPU image by default).
services:
  roof-mineru:
    image: roof-mineru:gpu
    container_name: roof-mineru
    build:
      context: .
      dockerfile: Dockerfile.gpu
    restart: unless-stopped
    shm_size: "4gb"
    ports:
      - "8001:8001"

    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

    environment:
      # Timezone and GPU runtime.
      TZ: Asia/Shanghai
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      # NOTE(review): this overrides the image's LD_LIBRARY_PATH and puts the
      # CUDA "stubs" directory on the runtime search path -- confirm that this
      # is intentional.
      LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs

      # Model source and offline switches.
      MINERU_MODEL_SOURCE: modelscope
      TRANSFORMERS_OFFLINE: "1"
      HF_HUB_OFFLINE: "1"
      MODEL_SCOPE_API: https://www.modelscope.cn
      HF_ENDPOINT: https://hf-mirror.com

      # Bind address and working directories (the same names start.sh reads).
      MINERU_HOST: 0.0.0.0
      MINERU_PORT: "8001"
      MINERU_UPLOAD_DIR: /app/uploads
      MINERU_OUTPUT_DIR: /app/output
      MINERU_LOG_DIR: /app/logs

      # Model/framework caches, all placed under the /app/models mount.
      MODELSCOPE_CACHE: /app/models/modelscope
      HF_HOME: /app/models/huggingface
      TRANSFORMERS_CACHE: /app/models/huggingface/transformers
      PADDLE_HOME: /app/models/paddle

      # Service defaults and limits; their exact meaning is defined by
      # roof_mineru/config.py, which is not shown here.
      MINERU_DEFAULT_LANG: ch
      MINERU_DEFAULT_BACKEND: pipeline
      MINERU_DEFAULT_PARSE_METHOD: auto
      MINERU_MAX_UPLOAD_MB: "500"
      MINERU_MAX_WORKERS: "2"
      MINERU_TASK_TTL_HOURS: "24"

    volumes:
      # Persistent data directories.
      - ./uploads:/app/uploads
      - ./output:/app/output
      - ./logs:/app/logs
      - ./models:/app/models
      # Download caches.
      - ./cache/pip:/root/.cache/pip
      - ./cache/huggingface:/root/.cache/huggingface
      # Bind-mount the service package and start script from the host.
      - ./roof_mineru:/app/roof_mineru
      - ./start.sh:/app/start.sh

    # Poll /health with curl inside the container; the 120s start period
    # gives the service time to come up before failures count.
    healthcheck:
      test:
        - CMD
        - curl
        - "-f"
        - http://127.0.0.1:8001/health
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s

    # Keep at most 10 rotated json-file logs of 100m each.
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "10"

    ulimits:
      nofile:
        soft: 65535
        hard: 65535

start.sh

#!/usr/bin/env sh
#
# Container start script for roof-mineru.
#
# Fills in defaults for the service settings, creates the working
# directories, reports whether an NVIDIA GPU (and Paddle on GPU) is usable,
# then replaces this shell with uvicorn serving roof_mineru.app:app.
#
# Environment (all optional):
#   MINERU_HOST, MINERU_PORT                  bind address / port
#   MINERU_UPLOAD_DIR, MINERU_OUTPUT_DIR,
#   MINERU_LOG_DIR                            working directories
#   MINERU_DEFAULT_BACKEND,
#   MINERU_DEFAULT_PARSE_METHOD               only echoed here
set -eu

# Assign defaults for anything that is unset or empty.
: "${MINERU_HOST:=0.0.0.0}"
: "${MINERU_PORT:=8001}"
: "${MINERU_UPLOAD_DIR:=/app/uploads}"
: "${MINERU_OUTPUT_DIR:=/app/output}"
: "${MINERU_LOG_DIR:=/app/logs}"

# ':=' only sets shell variables. Export them so the Python service inherits
# the same values that are logged and created below, even when they came
# from the defaults above rather than from the container environment.
export MINERU_HOST MINERU_PORT \
  MINERU_UPLOAD_DIR MINERU_OUTPUT_DIR MINERU_LOG_DIR

mkdir -p "$MINERU_UPLOAD_DIR" "$MINERU_OUTPUT_DIR" "$MINERU_LOG_DIR"

echo "=== GPU 检测 ==="
if nvidia-smi >/dev/null 2>&1; then
  echo "NVIDIA GPU 可用"
  # Show only the first GPU's name and total memory.
  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | head -n 1

  echo "测试 Paddle 能否使用 GPU..."
  # Probe output is discarded on purpose; only the exit status matters.
  if python3 -c "import paddle; paddle.set_device('gpu'); print('Paddle GPU 可用')" >/dev/null 2>&1; then
    echo "Paddle GPU 可用,使用 GPU 模式"
  else
    # A failed probe is not fatal: start the service anyway.
    echo "Paddle GPU 初始化警告(cuDNN),但继续尝试使用 GPU"
  fi
else
  echo "NVIDIA GPU 不可用,使用 CPU 模式"
  # An empty device list hides all CUDA devices from the service.
  export CUDA_VISIBLE_DEVICES=""
fi

echo ""
echo "Starting roof-mineru"
echo "  host=$MINERU_HOST"
echo "  port=$MINERU_PORT"
echo "  upload_dir=$MINERU_UPLOAD_DIR"
echo "  output_dir=$MINERU_OUTPUT_DIR"
echo "  log_dir=$MINERU_LOG_DIR"
echo "  backend=${MINERU_DEFAULT_BACKEND:-pipeline}"
echo "  parse_method=${MINERU_DEFAULT_PARSE_METHOD:-auto}"

# exec so uvicorn replaces this shell instead of running as its child.
exec uvicorn roof_mineru.app:app --host "$MINERU_HOST" --port "$MINERU_PORT"

Dockerfile.gpu

# syntax=docker/dockerfile:1
# GPU image for roof-mineru: CUDA 11.8 + cuDNN 8 on Ubuntu 22.04.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    TZ=Asia/Shanghai \
    PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ \
    PIP_TRUSTED_HOST=mirrors.aliyun.com

WORKDIR /app

# System dependencies (Aliyun apt mirror + BuildKit cache mounts).
# Ubuntu base images ship /etc/apt/apt.conf.d/docker-clean, which deletes
# downloaded packages after each install and would leave the cache mounts
# empty; remove it and tell apt to keep downloads so the cache is reused.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    rm -f /etc/apt/apt.conf.d/docker-clean \
    && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
        > /etc/apt/apt.conf.d/keep-cache \
    && sed -i \
        -e 's|http://archive.ubuntu.com|https://mirrors.aliyun.com|g' \
        -e 's|http://security.ubuntu.com|https://mirrors.aliyun.com|g' \
        /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-dev \
        curl wget git tzdata \
        libglib2.0-0 libgl1 libgomp1 \
        libsm6 libxext6 libxrender1 \
        libreoffice-writer libreoffice-core \
        fonts-noto-cjk \
        fonts-wqy-microhei \
        fonts-wqy-zenhei \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo Asia/Shanghai > /etc/timezone

# PyTorch (CUDA 11.8 wheels from the official PyTorch index).
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install --upgrade pip setuptools wheel \
    && pip3 install \
        torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/cu118

# PaddlePaddle GPU (2.6.2, the build matching CUDA 11.8).
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install \
        paddlepaddle-gpu==2.6.2 \
        -i https://mirrors.aliyun.com/pypi/simple/ \
        -f https://www.paddlepaddle.org.cn/whl/linux/gpu/cuda118.html

# MinerU + remaining dependencies, resolved in a single pip run.
# MinerU itself comes only from requirements.txt so the pinned version is the
# one installed; a separate unpinned "mineru[pipeline]" install would first
# pull the latest release and then downgrade it.
COPY requirements.txt /app/
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install \
        -r /app/requirements.txt \
        opencv-python \
        pillow numpy shapely scikit-image

COPY roof_mineru /app/roof_mineru
COPY start.sh /app/start.sh

RUN chmod +x /app/start.sh \
    && mkdir -p /app/uploads /app/output /app/logs /app/models /root/.cache

CMD ["/bin/bash", "/app/start.sh"]