-
Notifications
You must be signed in to change notification settings - Fork 19
/
ctc_scene_text_recognizer.cpp
179 lines (149 loc) · 6.93 KB
/
ctc_scene_text_recognizer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#include "ctc_scene_text_recognizer.h"
CTCSceneTextRecognizer::CTCSceneTextRecognizer(){
init_constant_vars();
}
CTCSceneTextRecognizer::CTCSceneTextRecognizer(std::string frozen_graph_filename, std::string dictionary_filename, int _im_height, int _im_width):Recognizer(frozen_graph_filename, dictionary_filename){
init_constant_vars(_im_height, _im_width);
}
bool CTCSceneTextRecognizer::init(const std::string frozen_graph_filename, const std::string dictionary_filename){
this->init_graph(frozen_graph_filename);
this->init_dictionary(dictionary_filename);
return true;
}
void CTCSceneTextRecognizer::init_constant_vars(int _im_height, int _im_width){
std::string input_layer_string = "input_images:0,input_seq_lens:0";
std::string output_layer_string = "CTCBeamSearchDecoder:0,CTCBeamSearchDecoder:1,CTCBeamSearchDecoder:2";
this->input_layers = str_util::Split(input_layer_string, ',');
this->output_layers = str_util::Split(output_layer_string, ',');
this->seq_len = 29;
this->image_width = _im_width; //input image width;
this->image_height = _im_height; //input image height
this->width_scale_ratio = 1.2; //scale the width for better recognition
}
void CTCSceneTextRecognizer::preprocess_image(cv::Mat& input_image, cv::Mat& output_image){
cv::Mat resized_image, padded_image;
int new_width = int(this->width_scale_ratio * input_image.cols);
cv::resize(input_image, input_image, cv::Size(new_width, input_image.rows));
float ratio=0;
resize_image_fix_height(input_image, resized_image, ratio, this->image_height);
pad_image_width(resized_image, output_image, this->image_width);
}
string CTCSceneTextRecognizer::run_graph(const cv::Mat& image){
int height = image.rows;
int width = image.cols;
Tensor input_img_tensor(DT_FLOAT, TensorShape({1, height, width, 3}));
unsigned char *input_data = (unsigned char*)(image.data);
auto input_tensor_mapped = input_img_tensor.tensor<float, 4>();
//(TODO) is there any other ways to copy the data into tensor?
for (int y = 0;y < height; ++y) {
for (int x = 0;x < width; ++x) {
unsigned char b = input_data[image.step * y + x * image.channels()];
unsigned char g = input_data[image.step * y + x * image.channels() + 1];
unsigned char r = input_data[image.step * y + x * image.channels() + 2];
input_tensor_mapped(0, y, x, 0) = float(r);
input_tensor_mapped(0, y, x, 1) = float(g);
input_tensor_mapped(0, y, x, 2) = float(b);
}
}
//create the seq len tensor and assign fixed value
Tensor input_seq_len_tensor(DT_INT32, TensorShape({1}));
auto input_seq_len_mapped = input_seq_len_tensor.tensor<int, 1>();
input_seq_len_mapped(0) = this->seq_len;
//create the input to run
std::vector<std::pair<string, Tensor> > inputs = {
{this->input_layers[0], input_img_tensor},
{this->input_layers[1], input_seq_len_tensor},
};
std::vector<Tensor> outputs;
Status run_status = this->session->Run(inputs,
this->output_layers, {}, &outputs);
if (!run_status.ok()) {
LOG(ERROR) << "Running model failed: " << run_status;
return "";
}
LOG(INFO) <<"number of output:"<<outputs.size();
auto indices = outputs[0].flat_outer_dims<long long>();
auto values = outputs[1].flat_outer_dims<long long>();
const Eigen::Tensor<float, indices.NumDimensions>::Dimensions& indices_dim = indices.dimensions();
const Eigen::Tensor<float, values.NumDimensions>::Dimensions& values_dim = values.dimensions();
LOG(INFO) << outputs[0].DebugString();
LOG(INFO) << outputs[1].DebugString();
std::vector<int> encoded_text;
for(int i=0; i<values_dim[0]; i++){
for(int j=0; j<values_dim[1]; j++){
encoded_text.push_back(values(i,j));
}
}
std::string decoded_text = decode_single_text(encoded_text);
return decoded_text;
}
std::vector<cv::Mat> CTCSceneTextRecognizer::preprocess_images(std::vector<cv::Mat>& input_images){
std::vector<cv::Mat> processed_images(input_images.size());
for(int i=0; i<input_images.size(); i++){
cv::Mat preprocessed_image;
this->preprocess_image(input_images[i], preprocessed_image);
processed_images[i] = preprocessed_image;
}
return processed_images;
}
std::vector<std::string> CTCSceneTextRecognizer::run_graph(const std::vector<cv::Mat> images){
//the images must be preprocessd and has the same height and width!!
std::vector<std::string> res;
int num_word = images.size();
if(num_word == 0) return res;
int height = this->image_height;
int width = this->image_width;
Tensor input_img_tensor(DT_FLOAT, TensorShape({num_word, height, width, 3}));
auto input_tensor_mapped = input_img_tensor.tensor<float, 4>();
//create the seq len tensor and assign fixed value for ctc
Tensor input_seq_len_tensor(DT_INT32, TensorShape({num_word}));
auto input_seq_len_mapped = input_seq_len_tensor.tensor<int, 1>();
for(int i=0; i<num_word; i++){
const cv::Mat& image = images[i];
//std::cout<<"assign image to tensor"<<i<<" "<<image.rows<<" "<<image.cols<<std::endl;
assert (image.rows == height);
assert (image.cols == width);
const unsigned char *input_data = (const unsigned char*)(image.data);
//(TODO) is there any other ways to copy the data into tensor?
for (int y = 0;y < height; ++y) {
for (int x = 0;x < width; ++x) {
const unsigned char b = input_data[image.step * y + x * image.channels()];
const unsigned char g = input_data[image.step * y + x * image.channels() + 1];
const unsigned char r = input_data[image.step * y + x * image.channels() + 2];
input_tensor_mapped(i, y, x, 0) = float(r);
input_tensor_mapped(i, y, x, 1) = float(g);
input_tensor_mapped(i, y, x, 2) = float(b);
}
}
input_seq_len_mapped(i) = this->seq_len;
}
//create the input to run
std::vector<std::pair<string, Tensor> > inputs = {
{this->input_layers[0], input_img_tensor},
{this->input_layers[1], input_seq_len_tensor},
};
//std::cout<<"run recognition graph"<<std::endl;
std::vector<Tensor> outputs;
Status run_status = this->session->Run(inputs,
this->output_layers, {}, &outputs);
if (!run_status.ok()) {
LOG(ERROR) << "Running model failed: " << run_status;
return res;
}
LOG(INFO) <<"number of output:"<<outputs.size();
//std::cout<<outputs[0].DebugString()<<std::endl;
//std::cout<<outputs[1].DebugString()<<std::endl;
auto indices_shape = outputs[0].shape();
auto indices = outputs[0].tensor<long long, 2>();
auto values = outputs[1].tensor<long long, 1>();
//const Eigen::Tensor<float, indices.NumDimensions>::Dimensions& indices_dim = indices.dimensions();
//const Eigen::Tensor<float, values.NumDimensions>::Dimensions& values_dim = values.dimensions();
std::vector<std::vector<int> > encoded_texts(num_word);
for(int i=0; i<indices_shape.dim_size(0); i++){
encoded_texts[indices(i, 0)].push_back(values(i));
}
for(int i=0; i<num_word; i++){
res.push_back(decode_single_text(encoded_texts[i]));
}
return res;
}