-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathrun.csx
More file actions
150 lines (124 loc) · 4.73 KB
/
run.csx
File metadata and controls
150 lines (124 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#r "System.Drawing"
#r "System.IO"
using System.Text;
using System.IO;
using System.Drawing.Imaging;
using iTextSharp;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Microsoft.ProjectOxford.Vision;
/// <summary>
/// Function entry point: extracts the images embedded in a PDF, sends each one
/// through <c>CovertImageToText</c>, and returns the concatenated text.
/// </summary>
/// <param name="pdfStream">Stream positioned at the start of the PDF document.</param>
/// <param name="name">Name of the PDF, used only in the log message.</param>
/// <param name="log">Trace writer that receives the informational log line.</param>
/// <returns>The recognized text of all extracted images, appended in extraction order.</returns>
public static async Task<string> Run(Stream pdfStream, string name, TraceWriter log)
{
    var imageList = GetImages(pdfStream);

    // Note: the count below is the number of extracted images, which the
    // message reports as "pages" (one image per page for scanned documents).
    log.Info($"{name} file contains {imageList.Count} pages");

    var resultDocument = new StringBuilder();
    try
    {
        foreach (var eachImage in imageList)
        {
            using (var ms = new MemoryStream())
            {
                // Re-encode as PNG so the downstream call always receives the same format.
                eachImage.Save(ms, ImageFormat.Png);
                ms.Seek(0, SeekOrigin.Begin);

                string text = await CovertImageToText(ms);
                resultDocument.Append(text);
            }
        }
    }
    finally
    {
        // System.Drawing.Image is IDisposable; release every extracted image
        // even when saving or text recognition fails part-way through.
        foreach (var eachImage in imageList)
        {
            eachImage.Dispose();
        }
    }

    return resultDocument.ToString();
}
/// <summary>
/// Walks every page of the PDF and gathers the images captured by an
/// <c>ImageRenderListener</c> on each page.
/// </summary>
/// <param name="pdfStream">Stream containing the PDF document to read.</param>
/// <returns>All images the listener captured, accumulated page by page.</returns>
public static List<System.Drawing.Image> GetImages(Stream pdfStream)
{
    var collected = new List<System.Drawing.Image>();

    using (var pdf = new PdfReader(pdfStream))
    {
        var contentParser = new PdfReaderContentParser(pdf);

        // PDF page numbers passed to ProcessContent start at 1.
        int page = 1;
        while (page <= pdf.NumberOfPages)
        {
            // A fresh listener per page so each page contributes only its own images.
            var pageListener = new ImageRenderListener();
            contentParser.ProcessContent(page, pageListener);

            // The listener stores images as dictionary keys; only the keys are needed here.
            collected.AddRange(pageListener.Images.Keys);
            page++;
        }
    }

    return collected;
}
/// <summary>
/// Render listener that records each image encountered while a page's content
/// is processed, together with a file extension derived from the image's
/// /Filter entry. Text callbacks are ignored.
/// </summary>
internal class ImageRenderListener : IRenderListener
{
    Dictionary<System.Drawing.Image, string> images = new Dictionary<System.Drawing.Image, string>();

    /// <summary>Captured images mapped to their extension (always starts with ".").</summary>
    public Dictionary<System.Drawing.Image, string> Images
    {
        get { return images; }
    }

    public void BeginTextBlock() { }

    public void EndTextBlock() { }

    /// <summary>
    /// Converts the rendered image to a <see cref="System.Drawing.Image"/> and stores it.
    /// Images without a usable filter name, or that cannot be converted, are skipped.
    /// </summary>
    /// <param name="renderInfo">Render information supplied by the content parser.</param>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null)
        {
            return;
        }

        /* /Filter may be a single name or an array of names, so a direct cast to
         * PdfName can throw InvalidCastException. Resolve it defensively instead.
         *
         * As in the original implementation, a missing filter is treated as
         * "PdfImageObject cannot decode this image" and the image is skipped. */
        PdfName filter = ResolveFilterName(image.Get(PdfName.FILTER));
        if (filter == null)
        {
            return;
        }

        /* Rather than decoding the raw image stream by hand, rely on
         * PdfImageObject.GetDrawingImage(), which does the work for us. */
        System.Drawing.Image drawingImage = image.GetDrawingImage();
        if (drawingImage == null)
        {
            // No image bytes were available; a null key would make Dictionary.Add throw.
            return;
        }

        // Value equality: the parsed name is not guaranteed to be the same
        // instance as the PdfName constant it is compared against.
        string extension = ".";
        if (PdfName.DCTDECODE.Equals(filter))
        {
            extension += PdfImageObject.ImageBytesType.JPG.FileExtension;
        }
        else if (PdfName.JPXDECODE.Equals(filter))
        {
            extension += PdfImageObject.ImageBytesType.JP2.FileExtension;
        }
        else if (PdfName.FLATEDECODE.Equals(filter))
        {
            extension += PdfImageObject.ImageBytesType.PNG.FileExtension;
        }
        else if (PdfName.LZWDECODE.Equals(filter))
        {
            // Mapping kept exactly as in the original code.
            extension += PdfImageObject.ImageBytesType.CCITT.FileExtension;
        }

        this.Images.Add(drawingImage, extension);
    }

    public void RenderText(TextRenderInfo renderInfo) { }

    /// <summary>
    /// Returns the filter name that describes the image data: the name itself
    /// when /Filter is a single name, or the last entry when it is an array
    /// (filters are listed in decoding order, so the last one is the image codec).
    /// Returns null when no name can be determined.
    /// </summary>
    private static PdfName ResolveFilterName(PdfObject filterObject)
    {
        var singleName = filterObject as PdfName;
        if (singleName != null)
        {
            return singleName;
        }

        var filterChain = filterObject as PdfArray;
        if (filterChain != null && filterChain.Size > 0)
        {
            return filterChain.GetAsName(filterChain.Size - 1);
        }

        return null;
    }
}
/// <summary>
/// Submits an image stream to the Vision service client's text recognition
/// call and flattens the result into plain text.
/// </summary>
/// <param name="imgStream">Stream containing the image to analyze.</param>
/// <returns>
/// The recognized words, each followed by a single space, with a line break
/// after every recognized line.
/// </returns>
public static async Task<string> CovertImageToText(Stream imgStream)
{
    // Credentials and endpoint come from the process environment.
    string apiKey = GetEnv("VISION_API_KEY");
    string endpoint = GetEnv("VISION_API_ENDPOINT");
    var visionClient = new VisionServiceClient(apiKey, endpoint);

    var ocr = await visionClient.RecognizeTextAsync(imgStream);

    // Result hierarchy is regions -> lines -> words; emit one text line per recognized line.
    var text = new StringBuilder();
    foreach (var region in ocr.Regions)
    {
        foreach (var line in region.Lines)
        {
            foreach (var word in line.Words)
            {
                text.Append(word.Text).Append(' ');
            }

            text.AppendLine();
        }
    }

    return text.ToString();
}
/// <summary>
/// Reads an environment variable from the current process.
/// </summary>
/// <param name="name">Name of the environment variable.</param>
/// <returns>The variable's value as returned by <c>Environment.GetEnvironmentVariable</c>.</returns>
public static string GetEnv(string name) =>
    System.Environment.GetEnvironmentVariable(name, System.EnvironmentVariableTarget.Process);