diff --git a/README.md b/README.md
index 4ccd7f100..6f1611ad5 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,48 @@ Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [
 
 ![example3](examples/example3.png)
 
+## Getting character level detector bounding box results:
+The number of detected character boxes may differ slightly from the recognized text, but it should be close or the same.
+```python
+import easyocr
+import numpy as np
+from PIL import Image, ImageDraw
+
+ocr = easyocr.Reader(['en'])
+image = 'examples/example.png'
+
+# increasing the link_threshold breaks the detected bboxes into character level bboxes
+result = ocr.readtext(image, link_threshold=1-1e-100)
+
+# raw character level boxes and the indices that map them to each text box
+textBoxList = ocr.detector_text_box_list
+batchTextBoxIndices = ocr.detector_text_box_indices
+
+im = Image.open(image, formats=['png'])
+draw = ImageDraw.Draw(im)
+for batch, textBoxIndices in enumerate(batchTextBoxIndices):
+    for bboxCharacterIndices in textBoxIndices:
+        if type(bboxCharacterIndices) == list:
+            # this is a horizontal list entry: several character boxes merged into one text box
+            for bboxCharacterIndex in bboxCharacterIndices:
+                box = textBoxList[batch][bboxCharacterIndex]
+                x_min = np.min(box[::2])
+                x_max = np.max(box[::2])
+                y_min = np.min(box[1::2])
+                y_max = np.max(box[1::2])
+                draw.rectangle([x_min, y_min, x_max, y_max], width=2, outline=(255, 0, 0))
+        elif type(bboxCharacterIndices) == int:
+            # this is a free box entry: a single character box index
+            box = textBoxList[batch][bboxCharacterIndices]
+            x_min = np.min(box[::2])
+            x_max = np.max(box[::2])
+            y_min = np.min(box[1::2])
+            y_max = np.max(box[1::2])
+            draw.rectangle([x_min, y_min, x_max, y_max], width=2, outline=(255, 0, 0))
+im.save('characterBox.png')
+```
+
+
 ## Installation
 
 Install using `pip`
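For readers who prefer the lower-level API, the sketch below shows how the new `detector_text_box_list` / `detector_text_box_indices` attributes relate to the boxes returned by `detect()`. It is a minimal illustration only: the `reader` instance and `image_path` value are hypothetical placeholders, while the attribute names and the free-boxes-first ordering come from the diff below.

```python
import easyocr

# hypothetical reader and input path, only for illustration
reader = easyocr.Reader(['en'])
image_path = 'your_image.png'

# a link_threshold close to 1 keeps the detector output at character level
horizontal_list, free_list = reader.detect(image_path, link_threshold=1 - 1e-100)

# one entry per batch image; a single image means index 0
char_polys = reader.detector_text_box_list[0]      # raw detector polygons, 8 values each
box_indices = reader.detector_text_box_indices[0]  # free-box indices first, then merged horizontal groups

for entry in box_indices:
    if isinstance(entry, list):
        # a horizontal text box assembled from several character-level polygons
        print('merged box from polygons:', [char_polys[j] for j in entry])
    else:
        # a free (slanted) box that maps to exactly one polygon
        print('free box polygon:', char_polys[entry])
```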
diff --git a/easyocr/easyocr.py b/easyocr/easyocr.py
index 4ef943401..811b576aa 100644
--- a/easyocr/easyocr.py
+++ b/easyocr/easyocr.py
@@ -228,6 +228,8 @@ def __init__(self, lang_list, gpu=True, model_storage_directory=None,
         self.recognizer, self.converter = get_recognizer(recog_network, network_params,\
                                                          self.character, separator_list,\
                                                          dict_list, model_path, device = self.device, quantize=quantize)
+        self.detector_text_box_indices = None
+        self.detector_text_box_list = None
 
     def getDetectorPath(self, detect_network):
         if detect_network in self.support_detection_network:
@@ -332,19 +334,31 @@ def detect(self, img, min_size = 20, text_threshold = 0.7, low_text = 0.4,\
                                  )
 
         horizontal_list_agg, free_list_agg = [], []
+        horizontal_list_agg_idx, free_list_agg_idx = [], []
+
         for text_box in text_box_list:
-            horizontal_list, free_list = group_text_box(text_box, slope_ths,
+            horizontal_list, free_list, craft_list_idx, free_idx = group_text_box(text_box, slope_ths,
                                                         ycenter_ths, height_ths,
                                                         width_ths, add_margin,
                                                         (optimal_num_chars is None))
             if min_size:
+                craft_list_idx = [idx for i, idx in zip(horizontal_list, craft_list_idx) if max(
+                    i[1] - i[0], i[3] - i[2]) > min_size]
+                free_idx = [idx for i, idx in zip(free_list, free_idx) if max(
+                    diff([c[0] for c in i]), diff([c[1] for c in i])) > min_size]
                 horizontal_list = [i for i in horizontal_list if max(
                     i[1] - i[0], i[3] - i[2]) > min_size]
                 free_list = [i for i in free_list if max(
                     diff([c[0] for c in i]), diff([c[1] for c in i])) > min_size]
+
             horizontal_list_agg.append(horizontal_list)
             free_list_agg.append(free_list)
+            horizontal_list_agg_idx.append(craft_list_idx)
+            free_list_agg_idx.append(free_idx)
 
+        self.detector_text_box_list = text_box_list
+        self.detector_text_box_indices = [free + hori for hori, free in zip(horizontal_list_agg_idx, free_list_agg_idx)]
+
         return horizontal_list_agg, free_list_agg
 
     def recognize(self, img_cv_grey, horizontal_list=None, free_list=None,\
@@ -352,7 +366,8 @@ def recognize(self, img_cv_grey, horizontal_list=None, free_list=None,\
                   workers = 0, allowlist = None, blocklist = None, detail = 1,\
                   rotation_info = None,paragraph = False,\
                   contrast_ths = 0.1,adjust_contrast = 0.5, filter_ths = 0.003,\
-                  y_ths = 0.5, x_ths = 1.0, reformat=True, output_format='standard'):
+                  y_ths = 0.5, x_ths = 1.0, reformat=True, output_format='standard',\
+                  textbox_indices=None):
 
         if reformat:
             img, img_cv_grey = reformat_input(img_cv_grey)
@@ -374,25 +389,25 @@ def recognize(self, img_cv_grey, horizontal_list=None, free_list=None,\
         # without gpu/parallelization, it is faster to process image one by one
         if ((batch_size == 1) or (self.device == 'cpu')) and not rotation_info:
             result = []
-            for bbox in horizontal_list:
+            for i, bbox in enumerate(horizontal_list):
                 h_list = [bbox]
                 f_list = []
-                image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height = imgH)
+                image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height = imgH, textbox_indices=textbox_indices[len(free_list)+i: len(free_list)+i+1] if textbox_indices is not None else None)
                 result0 = get_text(self.character, imgH, int(max_width), self.recognizer, self.converter, image_list,\
                                    ignore_char, decoder, beamWidth, batch_size, contrast_ths, adjust_contrast, filter_ths,\
                                    workers, self.device)
                 result += result0
-            for bbox in free_list:
+            for i, bbox in enumerate(free_list):
                 h_list = []
                 f_list = [bbox]
-                image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height = imgH)
+                image_list, max_width = get_image_list(h_list, f_list, img_cv_grey, model_height = imgH, textbox_indices=textbox_indices[i:i+1] if textbox_indices is not None else None)
                 result0 = get_text(self.character, imgH, int(max_width), self.recognizer, self.converter, image_list,\
                                    ignore_char, decoder, beamWidth, batch_size, contrast_ths, adjust_contrast, filter_ths,\
                                    workers, self.device)
                 result += result0
         # default mode will try to process multiple boxes at the same time
         else:
-            image_list, max_width = get_image_list(horizontal_list, free_list, img_cv_grey, model_height = imgH)
+            image_list, max_width = get_image_list(horizontal_list, free_list, img_cv_grey, model_height = imgH, textbox_indices=textbox_indices)
             image_len = len(image_list)
             if rotation_info and image_list:
                 image_list = make_rotated_img_list(rotation_info, image_list)
@@ -418,13 +433,21 @@ def recognize(self, img_cv_grey, horizontal_list=None, free_list=None,\
 
         if paragraph:
             result = get_paragraph(result, x_ths=x_ths, y_ths=y_ths, mode = direction_mode)
+
+        if rotation_info:
+            # add the rotation that gives the best result
+            result = [item[:3] + (([0]+rotation_info)[item[3]], ) for item in result]
 
         if detail == 0:
             return [item[1] for item in result]
         elif output_format == 'dict':
-            return [ {'boxes':item[0],'text':item[1],'confident':item[2]} for item in result]
+            if not rotation_info:
+                return [ {'boxes':item[0],'text':item[1],'confident':item[2]} for item in result]
+            return [ {'boxes':item[0],'text':item[1],'confident':item[2], 'rotation_idx': item[3]} for item in result]
         elif output_format == 'json':
-            return [json.dumps({'boxes':[list(map(int, lst)) for lst in item[0]],'text':item[1],'confident':item[2]}, ensure_ascii=False) for item in result]
+            if not rotation_info:
+                return [json.dumps({'boxes':[list(map(int, lst)) for lst in item[0]],'text':item[1],'confident':item[2]}, ensure_ascii=False) for item in result]
+            return [json.dumps({'boxes':[list(map(int, lst)) for lst in item[0]],'text':item[1],'confident':item[2], 'rotation_idx': item[3]}, ensure_ascii=False) for item in result]
         else:
             return result
@@ -455,12 +478,12 @@ def readtext(self, image, decoder = 'greedy', beamWidth= 5, batch_size = 1,\
                                  bbox_min_size = bbox_min_size, max_candidates = max_candidates
                                  )
         # get the 1st result from hor & free list as self.detect returns a list of depth 3
-        horizontal_list, free_list = horizontal_list[0], free_list[0]
+        horizontal_list, free_list, textbox_indices = horizontal_list[0], free_list[0], self.detector_text_box_indices[0]
 
         result = self.recognize(img_cv_grey, horizontal_list, free_list,\
                                 decoder, beamWidth, batch_size,\
                                 workers, allowlist, blocklist, detail, rotation_info,\
                                 paragraph, contrast_ths, adjust_contrast,\
-                                filter_ths, y_ths, x_ths, False, output_format)
+                                filter_ths, y_ths, x_ths, False, output_format, textbox_indices)
 
         return result
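With the `recognize()` changes above, results produced with `rotation_info` carry the winning rotation in a fourth field, exposed as `rotation_idx` in the `dict` and `json` output formats. A small hedged usage sketch (the `reader` and `image_path` names are placeholders, not part of this change):

```python
import easyocr

# placeholder reader and image; rotation_info and output_format are existing readtext arguments
reader = easyocr.Reader(['en'])
image_path = 'your_image.png'

results = reader.readtext(image_path, rotation_info=[90, 180, 270], output_format='dict')
for item in results:
    # rotation_idx holds the rotation (in degrees) that gave the best result:
    # 0 for the unrotated image, or one of the angles passed in rotation_info
    print(item['text'], item['confident'], item['rotation_idx'])
```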
diff --git a/easyocr/utils.py b/easyocr/utils.py
index 64435cfdb..b5ce136ec 100644
--- a/easyocr/utils.py
+++ b/easyocr/utils.py
@@ -407,9 +407,11 @@ def four_point_transform(image, rect):
 
 def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5, width_ths = 1.0, add_margin = 0.05, sort_output = True):
     # poly top-left, top-right, low-right, low-left
-    horizontal_list, free_list,combined_list, merged_list = [],[],[],[]
+    horizontal_list, free_list, combined_list, merged_list = [],[],[],[]
+    horizontal_idx, free_idx, combined_idx, merged_idx = [],[],[],[]
 
-    for poly in polys:
+    # this part just differentiates between boxes with a high slope (free_list) and normal horizontal text (horizontal_list)
+    for i, poly in enumerate(polys):
         slope_up = (poly[3]-poly[1])/np.maximum(10, (poly[2]-poly[0]))
         slope_down = (poly[5]-poly[7])/np.maximum(10, (poly[4]-poly[6]))
         if max(abs(slope_up), abs(slope_down)) < slope_ths:
@@ -418,6 +420,7 @@ def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5,
             y_max = max([poly[1],poly[3],poly[5],poly[7]])
             y_min = min([poly[1],poly[3],poly[5],poly[7]])
             horizontal_list.append([x_min, x_max, y_min, y_max, 0.5*(y_min+y_max), y_max-y_min])
+            horizontal_idx.append(i)
         else:
             height = np.linalg.norm([poly[6]-poly[0],poly[7]-poly[1]])
             width = np.linalg.norm([poly[2]-poly[0],poly[3]-poly[1]])
@@ -436,59 +439,77 @@ def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5,
             x4 = poly[6] - np.cos(theta24)*margin
             y4 = poly[7] + np.sin(theta24)*margin
 
+            free_idx.append(i)
             free_list.append([[x1,y1],[x2,y2],[x3,y3],[x4,y4]])
 
     if sort_output:
+        horizontal_idx = [x for _,x in sorted(zip(horizontal_list,horizontal_idx), key=lambda pair: pair[0][4])]
         horizontal_list = sorted(horizontal_list, key=lambda item: item[4])
 
     # combine box
+    # this part combines boxes that sit on the same horizontal line
     new_box = []
-    for poly in horizontal_list:
+    new_box_idx = []
+    for poly, idx in zip(horizontal_list, horizontal_idx):
 
         if len(new_box) == 0:
             b_height = [poly[5]]
             b_ycenter = [poly[4]]
             new_box.append(poly)
+            new_box_idx.append(idx)
         else:
             # comparable height and comparable y_center level up to ths*height
             if abs(np.mean(b_ycenter) - poly[4]) < ycenter_ths*np.mean(b_height):
                 b_height.append(poly[5])
                 b_ycenter.append(poly[4])
+                new_box_idx.append(idx)
                 new_box.append(poly)
             else:
                 b_height = [poly[5]]
                 b_ycenter = [poly[4]]
                 combined_list.append(new_box)
+                combined_idx.append(new_box_idx)
+                new_box_idx = [idx]
                 new_box = [poly]
+    combined_idx.append(new_box_idx)
     combined_list.append(new_box)
 
     # merge list use sort again
-    for boxes in combined_list:
+    for boxes, index in zip(combined_list, combined_idx):
         if len(boxes) == 1: # one box per line
             box = boxes[0]
             margin = int(add_margin*min(box[1]-box[0],box[5]))
+            merged_idx.append(index)
             merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
         else: # multiple boxes per line
+            index = [x for _,x in sorted(zip(boxes,index), key=lambda pair: pair[0][0])]
             boxes = sorted(boxes, key=lambda item: item[0])
 
             merged_box, new_box = [],[]
-            for box in boxes:
+            merged_box_idx, new_box_idx = [],[]
+            assert len(boxes) == len(index)
+            for box, idx in zip(boxes, index):
                 if len(new_box) == 0:
                     b_height = [box[5]]
                     x_max = box[1]
+                    new_box_idx.append(idx)
                     new_box.append(box)
                 else:
                     if (abs(np.mean(b_height) - box[5]) < height_ths*np.mean(b_height)) and ((box[0]-x_max) < width_ths *(box[3]-box[2])): # merge boxes
                         b_height.append(box[5])
                         x_max = box[1]
+                        new_box_idx.append(idx)
                         new_box.append(box)
                     else:
                         b_height = [box[5]]
                         x_max = box[1]
+                        merged_box_idx.append(new_box_idx)
                         merged_box.append(new_box)
+                        new_box_idx = [idx]
                         new_box = [box]
-            if len(new_box) >0: merged_box.append(new_box)
-
-            for mbox in merged_box:
+            if len(new_box) > 0:
+                merged_box_idx.append(new_box_idx)
+                merged_box.append(new_box)
+            for mbox, mbox_idx in zip(merged_box, merged_box_idx):
                 if len(mbox) != 1: # adjacent box in same line
                     # do I need to add margin here?
                     x_min = min(mbox, key=lambda x: x[0])[0]
@@ -500,6 +521,7 @@ def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5,
             box_height = y_max - y_min
             margin = int(add_margin * (min(box_width, box_height)))
 
+            merged_idx.append(mbox_idx)
             merged_list.append([x_min-margin, x_max+margin, y_min-margin, y_max+margin])
         else: # non adjacent box in same line
             box = mbox[0]
@@ -508,9 +530,11 @@ def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5,
             box_height = box[3] - box[2]
             margin = int(add_margin * (min(box_width, box_height)))
 
+            merged_idx.append(mbox_idx)
             merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+
     # may need to check if box is really in image
-    return merged_list, free_list
+    return merged_list, free_list, merged_idx, free_idx
 
 def calculate_ratio(width,height):
     '''
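To make the extra return values of `group_text_box` concrete, here is a toy sketch with made-up polygons: `merged_idx[k]` lists which input polygons were merged into `merged_list[k]`, and `free_idx[k]` gives the input position of `free_list[k]`. The polygon coordinates below are invented for illustration only.

```python
import numpy as np
from easyocr.utils import group_text_box

# three made-up polygons: two nearly horizontal boxes on one line and one slanted box
polys = [
    np.array([10, 10, 40, 10, 40, 30, 10, 30]),     # first box on the line
    np.array([45, 10, 80, 10, 80, 30, 45, 30]),     # close enough to be merged with the first
    np.array([100, 10, 130, 40, 120, 60, 90, 30]),  # strongly slanted, goes to free_list
]

merged_list, free_list, merged_idx, free_idx = group_text_box(polys)

print(merged_list, merged_idx)  # one merged box, built from input polygons 0 and 1
print(free_list, free_idx)      # the slanted polygon, kept as a free box at input position 2
```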
@@ -535,7 +559,7 @@ def compute_ratio_and_resize(img,width,height,model_height):
     return img,ratio
 
 
-def get_image_list(horizontal_list, free_list, img, model_height = 64, sort_output = True):
+def get_image_list(horizontal_list, free_list, img, model_height = 64, sort_output = True, textbox_indices=None):
     image_list = []
     maximum_y,maximum_x = img.shape
 
@@ -577,7 +601,10 @@ def get_image_list(horizontal_list, free_list, img, model_height = 64, sort_outp
     max_width = math.ceil(max_ratio)*model_height
 
     if sort_output:
+        if textbox_indices is not None:
+            textbox_indices[:] = [x for _,x in sorted(zip(image_list, textbox_indices), key=lambda pair: pair[0][0][0][1])]
         image_list = sorted(image_list, key=lambda item: item[0][0][1]) # sort by vertical position
+
     return image_list, max_width
 
 def download_and_unzip(url, filename, model_storage_directory, verbose=True):
@@ -788,6 +815,7 @@ def set_result_with_confidence(results):
         best_row = max(
             [(row_ix, results[row_ix][col_ix][2]) for row_ix in range(len(results))],
             key=lambda x: x[1])[0]
-        final_result.append(results[best_row][col_ix])
+        result_angle = results[best_row][col_ix] + (best_row, )
+        final_result.append(result_angle)
 
     return final_result
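Putting the pieces together, here is a hedged end-to-end sketch of the `detect()` → `recognize()` flow with the new `textbox_indices` plumbing, essentially what `readtext()` now does internally. The `reader` and `image_path` names are placeholders; the attribute and parameter names come from this diff.

```python
import easyocr
from easyocr.utils import reformat_input

# placeholder reader and input path
reader = easyocr.Reader(['en'])
image_path = 'your_image.png'

img, img_cv_grey = reformat_input(image_path)

# detect() now also stores the character-level polygons and their grouping indices
horizontal_list, free_list = reader.detect(img, link_threshold=1 - 1e-100, reformat=False)
textbox_indices = reader.detector_text_box_indices[0]

# recognize() accepts the per-box indices so they stay aligned with the sorted crops
result = reader.recognize(img_cv_grey,
                          horizontal_list[0], free_list[0],
                          reformat=False,
                          textbox_indices=textbox_indices)
print(result)
```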