Skip to content

Commit 8a20102

Browse files
awwaawwa and Byaidu authored
feat (docker): onnx model and font embedded (Byaidu#276)
* 将onnx模型和字体打包进docker镜像
* 调整docker构建顺序,先下模型后装包,并将预热更改为dockerfile RUN指令
* 移除无用import
* 正确处理authorized为None
* translate函数中更好的处理envs和prompt不存在的情况
* 移除docker镜像中无用文件

---------

Co-authored-by: Byaidu <[email protected]>
1 parent 9bd873a commit 8a20102

File tree

5 files changed

+191
-8
lines changed

5 files changed

+191
-8
lines changed

.dockerignore

+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
.github
2+
docs
3+
.git
4+
.pre-commit-config.yaml
5+
uv.lock
6+
pdf2zh_files
7+
gui/pdf2zh_files
8+
gradio_files
9+
tmp
10+
gui/gradio_files
11+
gui/tmp
12+
# Byte-compiled / optimized / DLL files
13+
__pycache__/
14+
*.py[cod]
15+
*$py.class
16+
17+
# C extensions
18+
*.so
19+
20+
# Distribution / packaging
21+
.Python
22+
build/
23+
develop-eggs/
24+
dist/
25+
downloads/
26+
eggs/
27+
.eggs/
28+
lib/
29+
lib64/
30+
parts/
31+
sdist/
32+
var/
33+
wheels/
34+
share/python-wheels/
35+
*.egg-info/
36+
.installed.cfg
37+
*.egg
38+
MANIFEST
39+
40+
# PyInstaller
41+
# Usually these files are written by a python script from a template
42+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
43+
*.manifest
44+
*.spec
45+
46+
# Installer logs
47+
pip-log.txt
48+
pip-delete-this-directory.txt
49+
50+
# Unit test / coverage reports
51+
htmlcov/
52+
.tox/
53+
.nox/
54+
.coverage
55+
.coverage.*
56+
.cache
57+
nosetests.xml
58+
coverage.xml
59+
*.cover
60+
*.py,cover
61+
.hypothesis/
62+
.pytest_cache/
63+
cover/
64+
65+
# Translations
66+
*.mo
67+
*.pot
68+
69+
# Django stuff:
70+
*.log
71+
local_settings.py
72+
db.sqlite3
73+
db.sqlite3-journal
74+
75+
# Flask stuff:
76+
instance/
77+
.webassets-cache
78+
79+
# Scrapy stuff:
80+
.scrapy
81+
82+
# Sphinx documentation
83+
docs/_build/
84+
85+
# PyBuilder
86+
.pybuilder/
87+
target/
88+
89+
# Jupyter Notebook
90+
.ipynb_checkpoints
91+
92+
# IPython
93+
profile_default/
94+
ipython_config.py
95+
96+
# pyenv
97+
# For a library or package, you might want to ignore these files since the code is
98+
# intended to run in multiple environments; otherwise, check them in:
99+
# .python-version
100+
101+
# pipenv
102+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
104+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
105+
# install all needed dependencies.
106+
#Pipfile.lock
107+
108+
# poetry
109+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110+
# This is especially recommended for binary packages to ensure reproducibility, and is more
111+
# commonly ignored for libraries.
112+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113+
#poetry.lock
114+
115+
# pdm
116+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
117+
#pdm.lock
118+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
119+
# in version control.
120+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
121+
.pdm.toml
122+
.pdm-python
123+
.pdm-build/
124+
125+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
126+
__pypackages__/
127+
128+
# Celery stuff
129+
celerybeat-schedule
130+
celerybeat.pid
131+
132+
# SageMath parsed files
133+
*.sage.py
134+
135+
# Environments
136+
.env
137+
.venv
138+
env/
139+
venv/
140+
ENV/
141+
env.bak/
142+
venv.bak/
143+
144+
# Spyder project settings
145+
.spyderproject
146+
.spyproject
147+
148+
# Rope project settings
149+
.ropeproject
150+
151+
# mkdocs documentation
152+
/site
153+
154+
# mypy
155+
.mypy_cache/
156+
.dmypy.json
157+
dmypy.json
158+
159+
# Pyre type checker
160+
.pyre/
161+
162+
# pytype static type analyzer
163+
.pytype/
164+
165+
# Cython debug symbols
166+
cython_debug/
167+
168+
# PyCharm
169+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
170+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
171+
# and can be added to the global gitignore or merged into this file. For a more nuclear
172+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
173+
.idea/
174+
.vscode
175+
.DS_Store

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ cython_debug/
165165
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166166
# and can be added to the global gitignore or merged into this file. For a more nuclear
167167
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
168-
#.idea/
168+
.idea/
169169
.vscode
170170
.DS_Store
171+
uv.lock

Dockerfile

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
1-
FROM python:3.12
1+
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
22

33
WORKDIR /app
44

5-
COPY . .
65

76
EXPOSE 7860
87

98
ENV PYTHONUNBUFFERED=1
9+
ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app
10+
RUN apt-get update && \
11+
apt-get install --no-install-recommends -y libgl1 && \
12+
rm -rf /var/lib/apt/lists/* && uv pip install --system --no-cache huggingface-hub && \
13+
python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('wybxc/DocLayout-YOLO-DocStructBench-onnx','doclayout_yolo_docstructbench_imgsz1024.onnx');"
1014

11-
RUN apt-get update && apt-get install -y libgl1
15+
COPY . .
1216

13-
RUN pip install .
17+
RUN uv pip install --system --no-cache .
1418

1519
CMD ["pdf2zh", "-i"]

pdf2zh/gui.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ def on_select_filetype(file_type):
445445
def readuserandpasswd(file_path):
446446
tuple_list = []
447447
content = ""
448-
if file_path is None or len(file_path) == 0:
448+
if file_path is None:
449449
return tuple_list, content
450450
if len(file_path) == 2:
451451
try:

pdf2zh/high_level.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,10 @@ def translate_stream(
187187
font_list.append((resfont, None))
188188
elif lang_out.lower() in noto_list: # noto
189189
resfont = "noto"
190-
ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
190+
# docker
191+
ttf_path = '/app/GoNotoKurrent-Regular.ttf'
192+
if not os.path.exists(ttf_path):
193+
ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
191194
if not os.path.exists(ttf_path):
192195
print("Downloading Noto font...")
193196
urllib.request.urlretrieve(
@@ -294,7 +297,7 @@ def translate(
294297
doc_raw = open(file, "rb")
295298
s_raw = doc_raw.read()
296299
s_mono, s_dual = translate_stream(
297-
s_raw, envs=kwarg.get("envs"), prompt=kwarg["prompt"], **locals()
300+
s_raw, envs=kwarg.get("envs", {}), prompt=kwarg.get("prompt", []), **locals()
298301
)
299302
file_mono = Path(output) / f"{filename}-mono.pdf"
300303
file_dual = Path(output) / f"{filename}-dual.pdf"

0 commit comments

Comments
 (0)