-
Notifications
You must be signed in to change notification settings - Fork 0
/
ServiceTesseract.cs
134 lines (114 loc) · 4.68 KB
/
ServiceTesseract.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
namespace ConsoleTesseractOcr
{
/// <summary>
/// Extracts text from images by running the Tesseract command-line executable
/// against temporary copies of the supplied image streams.
/// </summary>
public class TesseractService
{
    private const string TessdataPrefixVariable = "TESSDATA_PREFIX";

    private readonly string _tesseractExePath;
    private readonly string _language;
    private readonly string _dataDir;

    /// <summary>
    /// Initializes a new instance of the <see cref="TesseractService"/> class.
    /// </summary>
    /// <param name="tesseractDir">The path for the Tesseract4 installation folder (C:\Program Files\Tesseract-OCR).</param>
    /// <param name="language">
    /// The language used to extract text from images (eng, por, etc). It is passed verbatim to the
    /// <c>-l</c> option. Note that the default value is "en", which differs from the three-letter
    /// examples above; pass the code that matches the installed trained model.
    /// </param>
    /// <param name="dataDir">
    /// The folder with the trained models (tessdata). Download the models from https://github.com/tesseract-ocr/tessdata_fast.
    /// When null or empty, the "tessdata" folder inside <paramref name="tesseractDir"/> is used.
    /// </param>
    public TesseractService(string tesseractDir, string language = "en", string dataDir = null)
    {
        // Tesseract configs.
        _tesseractExePath = Path.Combine(tesseractDir, "tesseract.exe");
        _language = language;

        if (String.IsNullOrEmpty(dataDir))
        {
            dataDir = Path.Combine(tesseractDir, "tessdata");
        }

        _dataDir = dataDir;

        // Kept for backward compatibility with code that relies on the process-wide value.
        // Each Tesseract invocation also receives this instance's own value (see RunTesseract),
        // so instances created with different data folders do not overwrite each other.
        Environment.SetEnvironmentVariable(TessdataPrefixVariable, dataDir);
    }

    /// <summary>
    /// Reads the text from the image streams.
    /// </summary>
    /// <remarks>
    /// Every stream is rewound to its beginning when it is seekable, copied to a temporary file
    /// and then closed. The temporary folder is removed before the method returns or throws.
    /// </remarks>
    /// <param name="images">The image streams. When empty, an empty string is returned.</param>
    /// <returns>The text extracted from the images.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="images"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="images"/> contains a null entry.</exception>
    /// <exception cref="InvalidOperationException">Tesseract exited with a non-zero exit code.</exception>
    public string GetText(params Stream[] images)
    {
        if (images == null)
        {
            throw new ArgumentNullException(nameof(images));
        }

        if (images.Length == 0)
        {
            return string.Empty;
        }

        // Fail before touching the file system instead of half-way through writing the inputs.
        if (images.Any(image => image == null))
        {
            throw new ArgumentException("The image streams must not contain null entries.", nameof(images));
        }

        var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
        Directory.CreateDirectory(tempPath);

        var tempInputFile = NewTempFileName(tempPath);
        var tempOutputFile = NewTempFileName(tempPath);

        try
        {
            WriteInputFiles(images, tempPath, tempInputFile);
            RunTesseract(tempInputFile, tempOutputFile);

            // The result is read from the output base name plus the ".txt" extension.
            return File.ReadAllText(tempOutputFile + ".txt");
        }
        finally
        {
            Directory.Delete(tempPath, true);
        }
    }

    /// <summary>
    /// Runs the Tesseract executable for the given input file and output base name and waits for it to finish.
    /// </summary>
    /// <param name="inputFile">The image file, or the list file naming several image files.</param>
    /// <param name="outputBase">The output file path without extension.</param>
    /// <exception cref="InvalidOperationException">Tesseract exited with a non-zero exit code.</exception>
    private void RunTesseract(string inputFile, string outputBase)
    {
        var info = new ProcessStartInfo
        {
            FileName = _tesseractExePath,
            // The paths are quoted because the temporary folder may contain spaces.
            Arguments = $"\"{inputFile}\" \"{outputBase}\" -l {_language}",
            RedirectStandardError = true,
            RedirectStandardOutput = true,
            CreateNoWindow = true,
            UseShellExecute = false
        };

        // Hand this instance's data folder to the child process explicitly, so the run does not
        // depend on whatever the process-wide variable currently holds.
        info.EnvironmentVariables[TessdataPrefixVariable] = _dataDir;

        using (var ps = Process.Start(info))
        {
            // Both redirected pipes must be drained BEFORE waiting for exit: a child that fills
            // a pipe buffer blocks until somebody reads it, which would hang WaitForExit forever.
            // Standard output is drained asynchronously while standard error is read to its end.
            var stdoutDrain = ps.StandardOutput.ReadToEndAsync();
            var stderr = ps.StandardError.ReadToEnd();
            stdoutDrain.GetAwaiter().GetResult();

            ps.WaitForExit();

            var exitCode = ps.ExitCode;
            if (exitCode != 0)
            {
                var message = string.IsNullOrWhiteSpace(stderr)
                    ? $"Tesseract exited with code {exitCode}."
                    : stderr;
                throw new InvalidOperationException(message);
            }
        }
    }

    /// <summary>
    /// Writes the input streams to temporary files and prepares the Tesseract input file.
    /// </summary>
    /// <param name="inputStreams">The image streams; must contain at least one entry.</param>
    /// <param name="tempPath">The temporary folder that receives the image files.</param>
    /// <param name="tempInputFile">The file path that will be passed to Tesseract as its input.</param>
    private static void WriteInputFiles(Stream[] inputStreams, string tempPath, string tempInputFile)
    {
        // If there is more than one image, write each one to its own file and build a list file
        // (one image path per line) that is used as the input file.
        if (inputStreams.Length > 1)
        {
            var imagesListFileContent = new StringBuilder();

            foreach (var inputStream in inputStreams)
            {
                var imageFile = NewTempFileName(tempPath);

                using (var tempStream = File.OpenWrite(imageFile))
                {
                    CopyStream(inputStream, tempStream);
                }

                imagesListFileContent.AppendLine(imageFile);
            }

            File.WriteAllText(tempInputFile, imagesListFileContent.ToString());
        }
        else
        {
            // If there is only one image, then the image file itself is the input file.
            using (var tempStream = File.OpenWrite(tempInputFile))
            {
                CopyStream(inputStreams.First(), tempStream);
            }
        }
    }

    /// <summary>
    /// Copies the whole input stream to the output stream and closes the input stream.
    /// </summary>
    /// <param name="input">The source stream; it is rewound first when it supports seeking.</param>
    /// <param name="output">The destination stream; it is left open.</param>
    private static void CopyStream(Stream input, Stream output)
    {
        if (input.CanSeek)
        {
            input.Seek(0, SeekOrigin.Begin);
        }

        input.CopyTo(output);
        input.Close();
    }

    /// <summary>
    /// Builds a new, unique file path inside the given folder. The file itself is not created.
    /// </summary>
    /// <param name="tempPath">The folder the path should point into.</param>
    /// <returns>The folder combined with a freshly generated GUID as the file name.</returns>
    private static string NewTempFileName(string tempPath)
    {
        return Path.Combine(tempPath, Guid.NewGuid().ToString());
    }
}
}