|  | 
|  | 1 | +using Microsoft.AspNetCore.Mvc; | 
|  | 2 | +using OCR_with_Tesseract_in_Docker_on_Linux.Models; | 
|  | 3 | +using Syncfusion.Drawing; | 
|  | 4 | +using Syncfusion.OCRProcessor; | 
|  | 5 | +using Syncfusion.Pdf.Graphics; | 
|  | 6 | +using Syncfusion.Pdf.Parsing; | 
|  | 7 | +using System.Diagnostics; | 
|  | 8 | +using System.Xml.Linq; | 
|  | 9 | + | 
|  | 10 | +namespace OCR_with_Tesseract_in_Docker_on_Linux.Controllers | 
|  | 11 | +{ | 
/// <summary>
/// MVC controller that serves the default pages and an action that runs OCR
/// on a bundled PDF and returns the processed document as a download.
/// </summary>
public class HomeController : Controller
{
    private readonly ILogger<HomeController> _logger;

    /// <summary>
    /// Creates the controller with the logger supplied by dependency injection.
    /// </summary>
    /// <param name="logger">Logger for this controller.</param>
    public HomeController(ILogger<HomeController> logger)
    {
        _logger = logger;
    }

    /// <summary>Renders the default view for the Index action.</summary>
    public IActionResult Index()
    {
        return View();
    }

    /// <summary>Renders the default view for the Privacy action.</summary>
    public IActionResult Privacy()
    {
        return View();
    }

    /// <summary>
    /// Runs OCR over <c>Data/Input.pdf</c> using the external Tesseract engine
    /// and returns the resulting PDF as a file download named <c>Sample.pdf</c>.
    /// </summary>
    /// <returns>A <see cref="FileStreamResult"/> with content type <c>application/pdf</c>.</returns>
    public IActionResult PerformOCR()
    {
        string docPath = Path.GetFullPath(@"Data/Input.pdf");

        //Initialize the OCR processor and open the input PDF; both are released when the action returns.
        using (OCRProcessor processor = new OCRProcessor())
        using (FileStream fileStream = new FileStream(docPath, FileMode.Open, FileAccess.Read))
        {
            //Load a PDF document
            PdfLoadedDocument lDoc = new PdfLoadedDocument(fileStream);

            //The output stream is intentionally NOT wrapped in 'using': the returned
            //FileStreamResult reads it after this method has returned and disposes it
            //once the response has been written. Disposing it here breaks the download.
            MemoryStream stream = new MemoryStream();
            try
            {
                //Set OCR language to process
                processor.Settings.Language = Languages.English;
                IOcrEngine tesseractEngine = new Tesseract5OCREngine();
                processor.ExternalEngine = tesseractEngine;
                //Process OCR by providing the PDF document.
                processor.PerformOCR(lDoc);
                //Save the document to memory stream
                lDoc.Save(stream);
            }
            catch
            {
                //Nothing will consume the stream if OCR or saving failed.
                stream.Dispose();
                throw;
            }
            finally
            {
                //Close the document on both the success and the failure path.
                lDoc.Close();
            }

            //Set the position as '0'
            stream.Position = 0;
            //Download the PDF document in the browser
            FileStreamResult fileStreamResult = new FileStreamResult(stream, "application/pdf");
            fileStreamResult.FileDownloadName = "Sample.pdf";
            return fileStreamResult;
        }
    }

    /// <summary>
    /// Renders the error view with the current activity id, falling back to the
    /// HTTP trace identifier. Response caching is disabled for this action.
    /// </summary>
    [ResponseCache(Duration = 0, Location = ResponseCacheLocation.None, NoStore = true)]
    public IActionResult Error()
    {
        return View(new ErrorViewModel { RequestId = Activity.Current?.Id ?? HttpContext.TraceIdentifier });
    }
}
// Tesseract5OcrEngine implementation
/// <summary>
/// <see cref="IOcrEngine"/> implementation that shells out to the <c>tesseract</c>
/// command-line tool, asks it for hOCR output, and converts that output into an
/// <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OCREngine : IOcrEngine
{
    private float imageHeight;
    private float imageWidth;

    /// <summary>
    /// Runs Tesseract on the image contained in <paramref name="stream"/> and
    /// returns the recognized pages, lines and words with their bounding boxes.
    /// </summary>
    /// <param name="stream">Readable stream containing the image to recognize.</param>
    /// <returns>The layout result, including the image width and height.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code, produced no hOCR file, or produced an empty one.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));

        // Rewind only when the stream supports it; a forward-only stream is read from where it is.
        if (stream.CanSeek)
        {
            stream.Position = 0;
        }

        // Read the input once; the same bytes feed both the dimension probe and the temp file.
        byte[] imageBytes;
        using (MemoryStream buffer = new MemoryStream())
        {
            stream.CopyTo(buffer);
            imageBytes = buffer.ToArray();
            buffer.Position = 0;

            PdfTiffImage pdfTiffImage = new PdfTiffImage(buffer);
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string tempImageFile = Path.GetTempFileName();
        // The image path is also passed as the output base, so the result is expected at "<base>.hocr".
        string tempHocrFile = tempImageFile + ".hocr";
        string hocrText;

        try
        {
            // Write the image to the temp file
            File.WriteAllBytes(tempImageFile, imageBytes);

            ProcessStartInfo startInfo = new ProcessStartInfo
            {
                FileName = "tesseract",
                Arguments = $"\"{tempImageFile}\" \"{tempImageFile}\" -l eng hocr",
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true
            };

            using (Process process = new Process { StartInfo = startInfo })
            {
                process.Start();
                // Drain stderr before waiting so a full pipe cannot block the child process.
                string errorOutput = process.StandardError.ReadToEnd();
                process.WaitForExit();

                if (process.ExitCode != 0)
                    throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
            }

            if (!File.Exists(tempHocrFile))
                throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");

            hocrText = File.ReadAllText(tempHocrFile);
        }
        finally
        {
            // Clean up temp files on every path, including failures.
            TryDelete(tempImageFile);
            TryDelete(tempHocrFile);
        }

        if (string.IsNullOrEmpty(hocrText))
            throw new InvalidOperationException("HOCR text could not be generated or was empty.");

        var ocrLayoutResult = new OCRLayoutResult();
        BuildOCRLayoutResult(ocrLayoutResult, hocrText, imageWidth, imageHeight);
        ocrLayoutResult.ImageWidth = imageWidth;
        ocrLayoutResult.ImageHeight = imageHeight;

        return ocrLayoutResult;
    }

    /// <summary>
    /// Parses hOCR markup and appends one page per <c>ocr_page</c> element to
    /// <paramref name="ocr"/>, with its <c>ocr_line</c>/<c>ocr_header</c> lines and
    /// <c>ocrx_word</c> words.
    /// </summary>
    /// <param name="ocr">Result object that receives the parsed pages.</param>
    /// <param name="hOcrText">hOCR document text.</param>
    /// <param name="imageWidth">Width of the source image (not used while parsing).</param>
    /// <param name="imageHeight">Height of the source image (not used while parsing).</param>
    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        var doc = XDocument.Parse(hOcrText, LoadOptions.None);
        // Must be an XNamespace: with a plain string, 'ns + "div"' is string concatenation
        // and never yields the namespace-qualified element name.
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (var pageElement in doc.Descendants(ns + "div").Where(d => d.Attribute("class")?.Value == "ocr_page"))
        {
            Page ocrPage = new Page();

            foreach (var lineElement in pageElement.Descendants(ns + "span")
                .Where(s => s.Attribute("class")?.Value == "ocr_line" || s.Attribute("class")?.Value == "ocr_header"))
            {
                Line ocrLine = new Line();

                foreach (var wordElement in lineElement.Descendants(ns + "span")
                    .Where(s => s.Attribute("class")?.Value == "ocrx_word"))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };
                    string title = wordElement.Attribute("title")?.Value;

                    // Words without a usable bbox are still added, just without a rectangle.
                    if (title != null && TryParseBoundingBox(title, out RectangleF rectangle))
                    {
                        ocrWord.Rectangle = rectangle;
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }

    /// <summary>
    /// Extracts the <c>bbox x0 y0 x1 y1</c> property from a semicolon-separated
    /// <c>title</c> attribute value and converts it to an x/y/width/height rectangle.
    /// </summary>
    /// <param name="title">The <c>title</c> attribute value of a word element.</param>
    /// <param name="rectangle">The parsed rectangle when the method returns true.</param>
    /// <returns>True when a bbox with four integer coordinates was found.</returns>
    private static bool TryParseBoundingBox(string title, out RectangleF rectangle)
    {
        rectangle = default;

        foreach (string property in title.Split(';'))
        {
            string trimmed = property.Trim();
            if (!trimmed.StartsWith("bbox", StringComparison.Ordinal))
            {
                continue;
            }

            string[] parts = trimmed.Substring("bbox".Length).Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length != 4)
            {
                return false;
            }

            int[] coords = new int[4];
            for (int i = 0; i < coords.Length; i++)
            {
                // The coordinates are machine-generated, so parse them culture-invariantly.
                if (!int.TryParse(parts[i], System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture, out coords[i]))
                {
                    return false;
                }
            }

            rectangle = new RectangleF(coords[0], coords[1], coords[2] - coords[0], coords[3] - coords[1]);
            return true;
        }

        return false;
    }

    /// <summary>
    /// Deletes a temp file if it exists. Failures are ignored on purpose so that a
    /// cleanup problem never hides the exception that caused the cleanup.
    /// </summary>
    /// <param name="path">Path of the file to delete.</param>
    private static void TryDelete(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (IOException)
        {
            // Best effort: the file is in the temp directory.
        }
        catch (UnauthorizedAccessException)
        {
            // Best effort: the file is in the temp directory.
        }
    }
}
|  | 184 | +} | 
0 commit comments