Skip to content

Commit b0dbd71

Browse files
Parallelize tests (#4024)
1 parent 531490d commit b0dbd71

File tree

6 files changed

+92
-54
lines changed

6 files changed

+92
-54
lines changed

Makefile

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
142142
.PHONY: test
143143
test:
144144
PYTHONPATH=. CI=$(CI) \
145-
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
145+
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
146146

147147
.PHONY: test-unstructured-api-unit
148148
test-unstructured-api-unit:
@@ -151,60 +151,60 @@ test-unstructured-api-unit:
151151
.PHONY: test-no-extras
152152
test-no-extras:
153153
PYTHONPATH=. CI=$(CI) \
154-
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest \
154+
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto \
155155
test_${PACKAGE_NAME}/partition/test_text.py \
156156
test_${PACKAGE_NAME}/partition/test_email.py \
157157
test_${PACKAGE_NAME}/partition/html/test_partition.py \
158158
test_${PACKAGE_NAME}/partition/test_xml.py
159159

160160
.PHONY: test-extra-csv
161161
test-extra-csv:
162-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
162+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
163163
test_unstructured/partition/test_csv.py \
164164
test_unstructured/partition/test_tsv.py
165165

166166
.PHONY: test-extra-docx
167167
test-extra-docx:
168-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
168+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
169169
test_unstructured/partition/test_doc.py \
170170
test_unstructured/partition/test_docx.py
171171

172172
.PHONY: test-extra-epub
173173
test-extra-epub:
174-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_epub.py
174+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_epub.py
175175

176176
.PHONY: test-extra-markdown
177177
test-extra-markdown:
178-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_md.py
178+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_md.py
179179

180180
.PHONY: test-extra-odt
181181
test-extra-odt:
182-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_odt.py
182+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_odt.py
183183

184184
.PHONY: test-extra-pdf-image
185185
test-extra-pdf-image:
186-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/pdf_image
186+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/pdf_image
187187

188188
.PHONY: test-extra-pptx
189189
test-extra-pptx:
190-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
190+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
191191
test_unstructured/partition/test_ppt.py \
192192
test_unstructured/partition/test_pptx.py
193193

194194
.PHONY: test-extra-pypandoc
195195
test-extra-pypandoc:
196-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
196+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
197197
test_unstructured/partition/test_org.py \
198198
test_unstructured/partition/test_rst.py \
199199
test_unstructured/partition/test_rtf.py
200200

201201
.PHONY: test-extra-xlsx
202202
test-extra-xlsx:
203-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py
203+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_xlsx.py
204204

205205
.PHONY: test-text-extraction-evaluate
206206
test-text-extraction-evaluate:
207-
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py
207+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/metrics/test_text_extraction.py
208208

209209
## check: runs linters (includes tests)
210210
.PHONY: check

requirements/test.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ mypy
1010
pydantic
1111
pytest-cov
1212
pytest-mock
13+
pytest-xdist
1314
ruff
1415
types-Markdown
1516
types-requests

requirements/test.txt

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,54 +2,56 @@
22
# This file is autogenerated by pip-compile with Python 3.10
33
# by the following command:
44
#
5-
# pip-compile ./test.in
5+
# pip-compile requirements/test.in
66
#
77
annotated-types==0.7.0
88
# via pydantic
99
autoflake==2.3.1
10-
# via -r ./test.in
10+
# via -r requirements/test.in
1111
black==25.1.0
12-
# via -r ./test.in
12+
# via -r requirements/test.in
1313
click==8.2.1
1414
# via
15-
# -c ./base.txt
15+
# -c requirements/base.txt
1616
# black
1717
coverage[toml]==7.9.0
1818
# via
19-
# -r ./test.in
19+
# -r requirements/test.in
2020
# pytest-cov
2121
exceptiongroup==1.3.0
2222
# via
23-
# -c ./base.txt
23+
# -c requirements/base.txt
2424
# pytest
25+
execnet==2.1.1
26+
# via pytest-xdist
2527
flake8==7.2.0
2628
# via
27-
# -r ./test.in
29+
# -r requirements/test.in
2830
# flake8-print
2931
flake8-print==5.0.0
30-
# via -r ./test.in
32+
# via -r requirements/test.in
3133
freezegun==1.5.2
32-
# via -r ./test.in
34+
# via -r requirements/test.in
3335
grpcio==1.73.0
3436
# via
35-
# -c ././deps/constraints.txt
36-
# -r ./test.in
37+
# -c requirements/./deps/constraints.txt
38+
# -r requirements/test.in
3739
iniconfig==2.1.0
3840
# via pytest
3941
liccheck==0.9.2
40-
# via -r ./test.in
42+
# via -r requirements/test.in
4143
mccabe==0.7.0
4244
# via flake8
4345
mypy==1.16.0
44-
# via -r ./test.in
46+
# via -r requirements/test.in
4547
mypy-extensions==1.1.0
4648
# via
47-
# -c ./base.txt
49+
# -c requirements/base.txt
4850
# black
4951
# mypy
5052
packaging==25.0
5153
# via
52-
# -c ./base.txt
54+
# -c requirements/base.txt
5355
# black
5456
# pytest
5557
pathspec==0.12.1
@@ -67,7 +69,7 @@ pycodestyle==2.13.0
6769
# flake8
6870
# flake8-print
6971
pydantic==2.11.5
70-
# via -r ./test.in
72+
# via -r requirements/test.in
7173
pydantic-core==2.33.2
7274
# via pydantic
7375
pyflakes==3.3.2
@@ -80,21 +82,24 @@ pytest==8.4.0
8082
# via
8183
# pytest-cov
8284
# pytest-mock
85+
# pytest-xdist
8386
pytest-cov==6.2.1
84-
# via -r ./test.in
87+
# via -r requirements/test.in
8588
pytest-mock==3.14.1
86-
# via -r ./test.in
89+
# via -r requirements/test.in
90+
pytest-xdist==3.7.0
91+
# via -r requirements/test.in
8792
python-dateutil==2.9.0.post0
8893
# via
89-
# -c ./base.txt
94+
# -c requirements/base.txt
9095
# freezegun
9196
ruff==0.11.13
92-
# via -r ./test.in
97+
# via -r requirements/test.in
9398
semantic-version==2.10.0
9499
# via liccheck
95100
six==1.17.0
96101
# via
97-
# -c ./base.txt
102+
# -c requirements/base.txt
98103
# python-dateutil
99104
toml==0.10.2
100105
# via liccheck
@@ -106,16 +111,16 @@ tomli==2.2.1
106111
# mypy
107112
# pytest
108113
types-click==7.1.8
109-
# via -r ./test.in
114+
# via -r requirements/test.in
110115
types-markdown==3.8.0.20250415
111-
# via -r ./test.in
116+
# via -r requirements/test.in
112117
types-requests==2.32.4.20250611
113-
# via -r ./test.in
118+
# via -r requirements/test.in
114119
types-tabulate==0.9.0.20241207
115-
# via -r ./test.in
120+
# via -r requirements/test.in
116121
typing-extensions==4.14.0
117122
# via
118-
# -c ./base.txt
123+
# -c requirements/base.txt
119124
# black
120125
# exceptiongroup
121126
# mypy
@@ -126,6 +131,6 @@ typing-inspection==0.4.1
126131
# via pydantic
127132
urllib3==2.4.0
128133
# via
129-
# -c ././deps/constraints.txt
130-
# -c ./base.txt
134+
# -c requirements/./deps/constraints.txt
135+
# -c requirements/base.txt
131136
# types-requests
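(file name missing from this capture: newly added file, presumably a conftest.py visible to the test_unstructured/partition/pdf_image tests, which request its `mock_ocr_get_instance` fixture by name)

Lines changed: 18 additions & 0 deletions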
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import pytest
2+
3+
from unstructured.partition.utils.constants import OCR_AGENT_PADDLE, OCR_AGENT_TESSERACT
4+
5+
6+
@pytest.fixture
7+
def mock_ocr_get_instance(mocker):
8+
"""Fixture that mocks OCRAgent.get_instance to prevent real OCR agent instantiation."""
9+
10+
def mock_get_instance(ocr_agent_module, language):
11+
if ocr_agent_module in (OCR_AGENT_TESSERACT, OCR_AGENT_PADDLE):
12+
return mocker.MagicMock()
13+
else:
14+
raise ValueError(f"Unknown OCR agent: {ocr_agent_module}")
15+
16+
from unstructured.partition.pdf_image.ocr import OCRAgent
17+
18+
return mocker.patch.object(OCRAgent, "get_instance", side_effect=mock_get_instance)

test_unstructured/partition/pdf_image/test_ocr.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -622,11 +622,10 @@ def mock_page(mock_ocr_layout, mock_layout):
622622
return mock_page
623623

624624

625-
def test_supplement_layout_with_ocr(mocker, mock_page):
625+
def test_supplement_layout_with_ocr(mock_ocr_get_instance, mocker, mock_page):
626626
from unstructured.partition.pdf_image.ocr import OCRAgent
627627

628628
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
629-
spy = mocker.spy(OCRAgent, "get_instance")
630629

631630
ocr.supplement_page_layout_with_ocr(
632631
mock_page,
@@ -637,16 +636,21 @@ def test_supplement_layout_with_ocr(mocker, mock_page):
637636
table_ocr_agent=OCR_AGENT_PADDLE,
638637
)
639638

640-
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
641-
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
639+
assert mock_ocr_get_instance.call_args_list[0][1] == {
640+
"language": "eng",
641+
"ocr_agent_module": OCR_AGENT_TESSERACT,
642+
}
643+
assert mock_ocr_get_instance.call_args_list[1][1] == {
644+
"language": "en",
645+
"ocr_agent_module": OCR_AGENT_PADDLE,
646+
}
642647

643648

644-
def test_pass_down_agents(mocker, mock_page):
649+
def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page):
645650
from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage
646651

647652
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
648653
mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100)))
649-
spy = mocker.spy(OCRAgent, "get_instance")
650654
doc = MagicMock(DocumentLayout)
651655
doc.pages = [mock_page]
652656

@@ -661,5 +665,11 @@ def test_pass_down_agents(mocker, mock_page):
661665
table_ocr_agent=OCR_AGENT_TESSERACT,
662666
)
663667

664-
assert spy.call_args_list[0][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
665-
assert spy.call_args_list[1][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
668+
assert mock_ocr_get_instance.call_args_list[0][1] == {
669+
"language": "en",
670+
"ocr_agent_module": OCR_AGENT_PADDLE,
671+
}
672+
assert mock_ocr_get_instance.call_args_list[1][1] == {
673+
"language": "eng",
674+
"ocr_agent_module": OCR_AGENT_TESSERACT,
675+
}

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1588,10 +1588,7 @@ def _test(result):
15881588
_test(result)
15891589

15901590

1591-
def test_partition_pdf_with_specified_ocr_agents(mocker):
1592-
from unstructured.partition.pdf_image.ocr import OCRAgent
1593-
1594-
spy = mocker.spy(OCRAgent, "get_instance")
1591+
def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker):
15951592

15961593
pdf.partition_pdf(
15971594
filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
@@ -1601,8 +1598,15 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
16011598
table_ocr_agent=OCR_AGENT_PADDLE,
16021599
)
16031600

1604-
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
1605-
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
1601+
# Verify get_instance was called with correct parameters
1602+
assert mock_ocr_get_instance.call_args_list[0][1] == {
1603+
"language": "eng",
1604+
"ocr_agent_module": OCR_AGENT_TESSERACT,
1605+
}
1606+
assert mock_ocr_get_instance.call_args_list[1][1] == {
1607+
"language": "en",
1608+
"ocr_agent_module": OCR_AGENT_PADDLE,
1609+
}
16061610

16071611

16081612
def test_reproductible_pdf_loader():

0 commit comments

Comments
 (0)