forked from tabulapdf/tabula-java
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTestOCRConverter.java
More file actions
45 lines (34 loc) · 1.38 KB
/
TestOCRConverter.java
File metadata and controls
45 lines (34 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
package technology.tabula;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.junit.Test;
import technology.tabula.extractors.OcrConverter;
/**
 * Tests for {@link OcrConverter}: runs OCR over an image-based PDF fixture and
 * checks that text elements can subsequently be read from it.
 */
public class TestOCRConverter {

    /** Image-only PDF fixture that the converter is run against. */
    private static final String IMAGE_BASED_PDF =
            "src/test/resources/technology/tabula/wellExample_imageBased.pdf";

    /**
     * Lower bound on the number of text elements expected on page 1 after OCR.
     * This is an approximate limit and may need adjusting if Tesseract is updated.
     */
    private static final int MIN_EXPECTED_TEXT_ELEMENTS = 1200;

    /**
     * Converts the fixture PDF via OCR and verifies that text was extracted.
     *
     * <p>The fixture is backed up before conversion and restored afterwards
     * (the test reads the text back from the same path, so the conversion
     * appears to rewrite the file in place). The restore runs in a
     * {@code finally} block so that a failed assertion or a conversion error
     * cannot leave the checked-in fixture modified, and the temporary backup
     * is always removed.
     */
    @Test
    public void testConvert() {
        File testFile = new File(IMAGE_BASED_PDF);
        File backupFile = null;
        try {
            // create backup of wellExample_imageBased.pdf
            backupFile = Files.createTempFile("wellExample_imageBased", ".tmp").toFile();
            FileUtils.copyFile(testFile, backupFile);
            try {
                convertAndVerify(testFile);
            } finally {
                // restore original copy even when an assertion above failed
                FileUtils.copyFile(backupFile, testFile);
            }
        } catch (IOException e) {
            // keep the cause so the stack trace of the I/O failure is reported
            throw new AssertionError("I/O failure during OCR conversion test: " + e.getMessage(), e);
        } finally {
            // null-safe; never throws
            FileUtils.deleteQuietly(backupFile);
        }
    }

    /** Runs the OCR conversion on {@code pdf} and asserts that page 1 now contains text. */
    private static void convertAndVerify(File pdf) throws IOException {
        // convert document to text
        OcrConverter ocrConverter = new OcrConverter();
        boolean conversionResponse = ocrConverter.extract(pdf.getAbsolutePath(), null);
        assertTrue("OCR conversion should report success", conversionResponse);

        // check that text was extracted and is around the approximate acceptable limit
        Page page = UtilsForTesting.getPage(pdf.getAbsolutePath(), 1);
        List<TextElement> textElements = page.getText();
        assertTrue("expected more than " + MIN_EXPECTED_TEXT_ELEMENTS
                        + " text elements but found " + textElements.size(),
                textElements.size() > MIN_EXPECTED_TEXT_ELEMENTS);
    }
}