Skip to content

Commit

Permalink
Update benchmarks and added cancellation test
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Dec 20, 2024
1 parent f76ab8c commit d505aec
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 17 deletions.
17 changes: 13 additions & 4 deletions dedoc/api/process_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ async def handle(self, request: Request, parameters: dict, file_path: str, tmpdi
Handle request in a separate process.
Checks for client disconnection and terminates the child process if the client has disconnected.
"""
if not self.process.is_alive():
if self.process is None:
self.logger.info("Initialization of a new parsing process")
self.__init__(logger=self.logger)

self.logger.info("Putting file to the input queue")
Expand All @@ -56,7 +57,9 @@ async def handle(self, request: Request, parameters: dict, file_path: str, tmpdi
result = await future
except get_cancelled_exc_class():
self.logger.warning("Terminating the parsing process")
self.process.terminate()
if self.process is not None:
self.process.terminate()
self.process = None
future.cancel(DedocError)
return None

Expand Down Expand Up @@ -84,6 +87,7 @@ def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
manager.logger.info("Parsing process is waiting for the task in the input queue")

while True:
file_path = None
try:
parameters, file_path, tmp_dir = pickle.loads(input_queue.get(block=True))
manager.logger.info("Parsing process got task from the input queue")
Expand All @@ -95,10 +99,15 @@ def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:

output_queue.put(pickle.dumps(document_tree.to_api_schema()), block=True)
manager.logger.info("Parsing process put task to the output queue")
except Exception as e:
except DedocError as e:
tb = traceback.format_exc()
manager.logger.error(f"Exception {e}\n{tb}")
manager.logger.error(f"Exception {e}: {e.msg_api}\n{tb}")
output_queue.put(pickle.dumps(e.__dict__), block=True)
except Exception as e:
exc_message = f"Exception {e}\n{traceback.format_exc()}"
filename = "" if file_path is None else os.path.basename(file_path)
manager.logger.error(exc_message)
output_queue.put(pickle.dumps({"msg": exc_message, "filename": filename}), block=True)

def __add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
for attachment in document_tree.attachments:
Expand Down
24 changes: 12 additions & 12 deletions resources/benchmarks/time_benchmark.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
,Dataset,total_file_size,total_files,total_pages,total_time_raw,throughput_raw,mean_time_on_file_raw,mean_time_cpu_on_page_raw,total_time_indp_cpu,throughput_indp_cpu,mean_time_on_file_indp_cpu,mean_time_cpu_on_page_indp_cpu,cpu_performance,version
0,images,105240044,259,259,819.3893718719482,128437.16017401138,3.1636655284631208,3.1636655284631208,845.0002507880153,124544.39380563155,3.2625492308417576,3.262549230841758,1.0312560545636043,2.1
0,law_html,215921201,1526,1526,227.0532796382904,950971.5135759128,0.14878982938289018,0.14878982938289018,234.1500693355101,922148.7809623912,0.15344041240859116,0.15344041240859116,1.0312560545636043,2.1
0,other_html,215921201,1526,1526,156.9773073196411,1375493.0867831479,0.1028684844820715,0.1028684844820715,161.8837986024715,1333803.6472088536,0.10608374744591842,0.1060837474459184,1.0312560545636043,2.1
0,txt,2483851,999,999,13.047960042953491,190363.16725551253,0.013061021064017509,0.013061021064017509,13.455787793999773,184593.5026641549,0.013469257051050823,0.013469257051050825,1.0312560545636043,2.1
0,pdf_text_layer_true,109643533,33,445,417.5641739368439,262578.8797115134,12.653459816267997,1.0459708427522103,430.6155825412202,254620.4490626033,13.048957046703642,1.0786637644852126,1.0312560545636043,2.1
0,pdf_text_layer_auto,109643533,33,445,744.6476347446442,147242.16916045017,22.565079840746794,1.9358688088909384,767.9223818468816,142779.44697523108,23.270375207481262,1.9963764300096132,1.0312560545636043,2.1
0,pdf_text_layer_auto_tabby,109643533,33,445,861.5465660095215,127263.61792357055,26.107471697258227,2.408536994270351,888.475112485801,123406.42012271588,26.923488257145486,2.4838183579817246,1.0312560545636043,2.1
0,pdf_text_layer_false,109643533,33,445,1923.4744081497192,57002.85511231277,58.28710327726422,4.837624405643553,1983.5946292025433,55275.171340869965,60.10892815765283,4.988829458024572,1.0312560545636043,2.1
0,pdf_text_layer_tabby,109643533,33,445,459.48560762405396,238622.3445973723,13.923806291637998,1.2937336014756313,473.84731484714223,231390.00594604985,14.359009540822491,1.334170609514122,1.0312560545636043,2.1
0,docx,417727,22,22,16.942837953567505,24655.078514284138,0.770128997889432,0.770128997889432,17.472404221106515,23907.814557963888,0.794200191868478,0.794200191868478,1.0312560545636043,2.1
0,pdf,6086144,18,117,375.61194705963135,16203.275874592393,20.86733039220174,3.0367271868588284,387.35209457166883,15712.175267130062,21.519560809537158,3.131643297506068,1.0312560545636043,2.1
0,pdf_tables,16536264,2,267,1197.7023212909698,13806.656049706928,598.8511606454849,4.039958413717207,1235.137770396196,13388.193929731136,617.568885198098,4.166231574331044,1.0312560545636043,2.1
0,images,105240044,259,259,780.3763222694397,134858.06910946214,3.0130359933183,3.0130359933183,1066.0429167915163,98720.2694585152,4.115995817727862,4.115995817727862,1.366062611550437,2.3.1
0,law_html,215921201,1526,1526,204.2208013534546,1057292.8887214332,0.13382752382270943,0.13382752382270943,278.9784012298232,773971.0316216326,0.18281677669057877,0.18281677669057877,1.366062611550437,2.3.1
0,other_html,215921201,1526,1526,152.16186046600342,1419023.139824463,0.0997128836605527,0.0997128836605527,207.86262848656185,1038768.7415102572,0.13621404225855954,0.13621404225855951,1.366062611550437,2.3.1
0,txt,2483851,999,999,12.656875133514404,196245.20063589464,0.012669544678192597,0.012669544678192597,17.290083898956475,143657.5446663917,0.01730739129024672,0.01730739129024672,1.366062611550437,2.3.1
0,pdf_text_layer_true,109643533,33,445,294.70041608810425,372050.825225916,8.930315639033463,0.7666412830448923,402.57922002631614,272352.6887275323,12.199370303827761,1.0472799932386834,1.366062611550437,2.3.1
0,pdf_text_layer_auto,109643533,33,445,715.7886617183685,153178.6389809286,21.69056550661723,1.9423069545744724,977.8121285451869,112131.49213349436,29.63067056197536,2.65331291079858,1.366062611550437,2.3.1
0,pdf_text_layer_auto_tabby,109643533,33,445,844.7789170742035,129789.61806923167,25.59936112346071,2.380779043078811,1154.0208936411366,95010.00684143213,34.97033011033747,3.2522932371127915,1.366062611550437,2.3.1
0,pdf_text_layer_false,109643533,33,445,1591.9220836162567,68874.93686307219,48.240063139886566,3.9471288925826884,2174.665238929637,50418.57985184248,65.89894663423142,5.392025203127692,1.366062611550437,2.3.1
0,pdf_text_layer_tabby,109643533,33,445,421.8361530303955,259919.71577670728,12.782913728193803,1.1935813540785523,576.2545968550919,190269.25528816486,17.46226051076036,1.630506861650454,1.366062611550437,2.3.1
0,docx,417727,22,22,17.311132431030273,24130.541526631885,0.7868696559559215,0.7868696559559215,23.64809077762868,17664.30127184617,1.07491321716494,1.07491321716494,1.366062611550437,2.3.1
0,pdf,6086144,18,117,310.7921574115753,19582.68204284271,17.26623096730974,2.519494602322346,424.5615462030511,14335.127743974337,23.58675256683617,3.441787376235694,1.366062611550437,2.3.1
0,pdf_tables,16536264,2,267,1083.6798040866852,15259.363455551895,541.8399020433426,3.6198095974726074,1480.3744632551231,11170.325083586768,740.1872316275616,4.944886552038766,1.366062611550437,2.3.1
5 changes: 4 additions & 1 deletion scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ def get_times(spend_page_times: List, total_size: int, total_time: int, total_fi
file_size = os.path.getsize(file_path)
total_size += file_size
time_start = time.time()
send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
try:
send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
except AssertionError as e:
print(f"Error on file {file_path}: {e}")
time_finish = time.time()
spend_file_time = time_finish - time_start
pages = page_func(file_path)
Expand Down
22 changes: 22 additions & 0 deletions tests/api_tests/test_api_misc_main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import json
import os
import time

import requests
from requests import ReadTimeout

from tests.api_tests.abstract_api_test import AbstractTestApiDocReader

Expand All @@ -14,6 +16,26 @@ def __get_version(self) -> str:
version = file.read().strip()
return version

def test_cancellation(self) -> None:
    """
    Check that the service still answers after a client abandons a request.

    A PDF is uploaded with pdf_with_text_layer=False and a one-second client timeout;
    a ReadTimeout raised by that first request is ignored.
    A small text file is uploaded afterwards, and the test requires that the whole scenario
    takes less than 60 seconds and that the second upload returns HTTP 200.
    """
    pdf_name = "article.pdf"
    started_at = time.time()

    # First request: the client is expected to stop waiting after one second
    pdf_path = self._get_abs_path(os.path.join("pdf_with_text_layer", pdf_name))
    with open(pdf_path, "rb") as pdf_file:
        upload_url = f"http://{self._get_host()}:{self._get_port()}/upload"
        try:
            requests.post(upload_url, files={"file": (pdf_name, pdf_file)}, data={"pdf_with_text_layer": False}, timeout=1)
        except ReadTimeout:
            pass

    # Second request: it must be served although the previous one was abandoned
    txt_name = "example.txt"
    txt_path = self._get_abs_path(os.path.join("txt", txt_name))
    with open(txt_path, "rb") as txt_file:
        upload_url = f"http://{self._get_host()}:{self._get_port()}/upload"
        response = requests.post(upload_url, files={"file": (txt_name, txt_file)}, data={}, timeout=60)

    elapsed = time.time() - started_at
    self.assertLess(elapsed, 60)
    self.assertEqual(200, response.status_code)

def test_bin_file(self) -> None:
file_name = "file.bin"
result = self._send_request(file_name, expected_code=415)
Expand Down

0 comments on commit d505aec

Please sign in to comment.