Fixing the OCR on the server side.

christian-oreilly · pafonta · commit 710d283e98d5 · 2018-08-21T11:57:03.000+02:00
For some reason, the behavior of ocrmypdf seems to have changed. Whereas before we expected to get the .txt file directly from it, it now generates a PDF with the OCR-ed text overlaid on it. This commit fixes the issue by overwriting the original scanned PDF with the PDF that has the text overlaid, and then running the usual pdftotext on this new PDF.
diff --git a/nat/restServer.py b/nat/restServer.py
@@ -53,7 +53,8 @@ def runOCR(fileName):
         app.OCRLock.release()    
                 
         # Run OCR
-        run_ocrmypdf(fileName + ".pdf", fileName + ".txt")
+        run_ocrmypdf(fileName + ".pdf", fileName + ".pdf")
+        check_call(['pdftotext', '-enc', 'UTF-8', fileName + ".pdf", fileName + ".txt"])
  
         acquireLockWithTimeout()
         del app.OCRFiles[app.OCRFiles.index(fileName)]