Skip to content

Commit 1bf4f14

Browse files
committed
980096: Added ASP.NET Core MVC project.
1 parent ab393c5 commit 1bf4f14

File tree

80 files changed

+74936
-174
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+74936
-174
lines changed
Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
11

22
Microsoft Visual Studio Solution File, Format Version 12.00
33
# Visual Studio Version 17
4-
VisualStudioVersion = 17.14.36408.4 d17.14
4+
VisualStudioVersion = 17.14.36616.10 d17.14
55
MinimumVisualStudioVersion = 10.0.40219.1
6-
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCR-with-Tesseract-in-Docker-on-Linux", "OCR-with-Tesseract-in-Docker-on-Linux\OCR-with-Tesseract-in-Docker-on-Linux.csproj", "{92C3B623-ED53-4127-8161-975BCD7AA532}"
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCR-with-Tesseract-in-Docker-on-Linux", "OCR-with-Tesseract-in-Docker-on-Linux\OCR-with-Tesseract-in-Docker-on-Linux.csproj", "{40EBF01A-F47E-433F-9C5F-1E118D6BE123}"
77
EndProject
88
Global
99
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1010
Debug|Any CPU = Debug|Any CPU
1111
Release|Any CPU = Release|Any CPU
1212
EndGlobalSection
1313
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14-
{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15-
{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.Build.0 = Debug|Any CPU
16-
{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.ActiveCfg = Release|Any CPU
17-
{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.Build.0 = Release|Any CPU
14+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Release|Any CPU.Build.0 = Release|Any CPU
1818
EndGlobalSection
1919
GlobalSection(SolutionProperties) = preSolution
2020
HideSolutionNode = FALSE
2121
EndGlobalSection
2222
GlobalSection(ExtensibilityGlobals) = postSolution
23-
SolutionGuid = {BEF3B3F0-759C-4D53-BF94-8EB1E0E7D2FE}
23+
SolutionGuid = {82BE2249-6CF6-4098-8CFB-FE96ABAEE376}
2424
EndGlobalSection
2525
EndGlobal
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
using Microsoft.AspNetCore.Mvc;
2+
using OCR_with_Tesseract_in_Docker_on_Linux.Models;
3+
using Syncfusion.Drawing;
4+
using Syncfusion.OCRProcessor;
5+
using Syncfusion.Pdf.Graphics;
6+
using Syncfusion.Pdf.Parsing;
7+
using System.Diagnostics;
8+
using System.Xml.Linq;
9+
10+
namespace OCR_with_Tesseract_in_Docker_on_Linux.Controllers
11+
{
12+
/// <summary>
/// MVC controller that serves the default pages and an action that runs OCR
/// on the bundled sample PDF and returns the result as a download.
/// </summary>
public class HomeController : Controller
{
    private readonly ILogger<HomeController> _logger;

    /// <summary>Creates the controller with the logger supplied by dependency injection.</summary>
    /// <param name="logger">Logger for this controller.</param>
    public HomeController(ILogger<HomeController> logger)
    {
        _logger = logger;
    }

    /// <summary>Renders the default view for the Index action.</summary>
    public IActionResult Index()
    {
        return View();
    }

    /// <summary>Renders the default view for the Privacy action.</summary>
    public IActionResult Privacy()
    {
        return View();
    }

    /// <summary>
    /// Runs OCR over <c>Data/Input.pdf</c> using <see cref="Tesseract5OCREngine"/> as the
    /// external engine and returns the processed PDF as a file download named "Sample.pdf".
    /// </summary>
    /// <returns>A <see cref="FileStreamResult"/> with content type "application/pdf".</returns>
    public IActionResult PerformOCR()
    {
        string docPath = Path.GetFullPath(@"Data/Input.pdf");

        // Deliberately NOT wrapped in a using block: the FileStreamResult reads from this
        // stream after the action has returned and disposes it once the response is written.
        // Disposing it here would hand MVC a closed stream.
        MemoryStream stream = new MemoryStream();
        try
        {
            //Initialize the OCR processor.
            using (OCRProcessor processor = new OCRProcessor())
            using (FileStream fileStream = new FileStream(docPath, FileMode.Open, FileAccess.Read))
            {
                //Load a PDF document
                PdfLoadedDocument lDoc = new PdfLoadedDocument(fileStream);
                try
                {
                    //Set OCR language to process
                    processor.Settings.Language = Languages.English;
                    IOcrEngine tesseractEngine = new Tesseract5OCREngine();
                    processor.ExternalEngine = tesseractEngine;
                    //Process OCR by providing the PDF document.
                    processor.PerformOCR(lDoc);
                    //Save the document to memory stream
                    lDoc.Save(stream);
                }
                finally
                {
                    // Close the document even when OCR or saving fails.
                    lDoc.Close();
                }
            }
        }
        catch
        {
            // Nothing will consume the stream if we fail before returning the result.
            stream.Dispose();
            throw;
        }

        //Set the position as '0'
        stream.Position = 0;
        //Download the PDF document in the browser
        FileStreamResult fileStreamResult = new FileStreamResult(stream, "application/pdf");
        fileStreamResult.FileDownloadName = "Sample.pdf";
        return fileStreamResult;
    }

    /// <summary>
    /// Renders the error view with the current activity id, falling back to the
    /// HTTP trace identifier. Response caching is disabled for this action.
    /// </summary>
    [ResponseCache(Duration = 0, Location = ResponseCacheLocation.None, NoStore = true)]
    public IActionResult Error()
    {
        return View(new ErrorViewModel { RequestId = Activity.Current?.Id ?? HttpContext.TraceIdentifier });
    }
}
67+
// Tesseract5OcrEngine implementation
68+
/// <summary>
/// <see cref="IOcrEngine"/> implementation that writes the input image to a temporary file,
/// runs the <c>tesseract</c> command-line tool to produce hOCR output, and converts that
/// output into an <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OCREngine : IOcrEngine
{
    private float imageHeight;
    private float imageWidth;

    /// <summary>
    /// Performs OCR on the image contained in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Readable stream holding the image; it is rewound to position 0 before use.</param>
    /// <returns>The recognized pages, lines and words together with the image dimensions.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// The tesseract process exits with a non-zero code, or produces no (or empty) hOCR output.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));

        stream.Position = 0;

        // Load a copy of the image only to read its dimensions.
        using (MemoryStream tempMemStream = new MemoryStream())
        {
            stream.CopyTo(tempMemStream);
            tempMemStream.Position = 0;
            PdfTiffImage pdfTiffImage = new PdfTiffImage(tempMemStream);
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string tempImageFile = Path.GetTempFileName();
        // The image path is also passed as the output base, so tesseract writes "<image>.hocr".
        string tempHocrFile = tempImageFile + ".hocr";
        string hocrText;

        try
        {
            // Write stream to temp image file
            using (FileStream tempFileStream = new FileStream(tempImageFile, FileMode.Create, FileAccess.Write))
            {
                stream.Position = 0;
                stream.CopyTo(tempFileStream);
            }

            ProcessStartInfo startInfo = new ProcessStartInfo
            {
                FileName = "tesseract",
                Arguments = $"\"{tempImageFile}\" \"{tempImageFile}\" -l eng hocr",
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true
            };

            using (Process process = new Process { StartInfo = startInfo })
            {
                process.Start();
                // Drain stderr before waiting so the child cannot block on a full pipe.
                string errorOutput = process.StandardError.ReadToEnd();
                process.WaitForExit();

                if (process.ExitCode != 0)
                    throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");

                if (!File.Exists(tempHocrFile))
                    throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");

                hocrText = File.ReadAllText(tempHocrFile);
            }
        }
        finally
        {
            // Clean up temp files on both the success and the failure path.
            if (File.Exists(tempImageFile)) File.Delete(tempImageFile);
            if (File.Exists(tempHocrFile)) File.Delete(tempHocrFile);
        }

        if (string.IsNullOrEmpty(hocrText))
            throw new InvalidOperationException("HOCR text could not be generated or was empty.");

        var ocrLayoutResult = new OCRLayoutResult();
        BuildOCRLayoutResult(ocrLayoutResult, hocrText, imageWidth, imageHeight);
        ocrLayoutResult.ImageWidth = imageWidth;
        ocrLayoutResult.ImageHeight = imageHeight;

        return ocrLayoutResult;
    }

    /// <summary>
    /// Parses hOCR markup and appends one page per <c>ocr_page</c> div to <paramref name="ocr"/>,
    /// with a line per <c>ocr_line</c>/<c>ocr_header</c> span and a word per <c>ocrx_word</c> span.
    /// </summary>
    /// <param name="ocr">Result object that receives the parsed pages.</param>
    /// <param name="hOcrText">hOCR document text (XHTML).</param>
    /// <param name="imageWidth">Image width; not used by the parsing itself.</param>
    /// <param name="imageHeight">Image height; not used by the parsing itself.</param>
    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        var doc = XDocument.Parse(hOcrText, LoadOptions.None);
        // Must be an XNamespace, not a string: XNamespace + string yields a namespace-qualified
        // XName, whereas string + string yields a single invalid element name.
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (var pageElement in doc.Descendants(ns + "div").Where(d => d.Attribute("class")?.Value == "ocr_page"))
        {
            Page ocrPage = new Page();

            foreach (var lineElement in pageElement.Descendants(ns + "span")
                .Where(s => s.Attribute("class")?.Value == "ocr_line" || s.Attribute("class")?.Value == "ocr_header"))
            {
                Line ocrLine = new Line();

                foreach (var wordElement in lineElement.Descendants(ns + "span")
                    .Where(s => s.Attribute("class")?.Value == "ocrx_word"))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };
                    string? title = wordElement.Attribute("title")?.Value;

                    if (title != null)
                    {
                        // The title holds ';'-separated properties, e.g. "bbox 10 20 30 40; x_wconf 95".
                        // Pick the bbox property wherever it appears instead of assuming it is first.
                        string? bboxProperty = title.Split(';')
                            .Select(p => p.Trim())
                            .FirstOrDefault(p => p.StartsWith("bbox", StringComparison.Ordinal));

                        if (bboxProperty != null)
                        {
                            // Coordinates are machine-generated integers; parse them culture-independently.
                            int[] coords = bboxProperty.Substring("bbox".Length)
                                .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                                .Select(v => int.Parse(v, System.Globalization.CultureInfo.InvariantCulture))
                                .ToArray();

                            if (coords.Length == 4)
                            {
                                // bbox is "left top right bottom"; convert to x, y, width, height.
                                float x = coords[0];
                                float y = coords[1];
                                float width = coords[2] - coords[0];
                                float height = coords[3] - coords[1];
                                ocrWord.Rectangle = new RectangleF(x, y, width, height);
                            }
                        }
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }
}
184+
}

OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.
22

3+
34
# This stage is used when running from VS in fast mode (Default for Debug configuration)
45
FROM mcr.microsoft.com/dotnet/runtime:8.0 AS
56
RUN apt-get update && apt-get install -y tesseract-ocr
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
namespace OCR_with_Tesseract_in_Docker_on_Linux.Models
2+
{
3+
/// <summary>
/// View model carrying the identifier of a request, for display by a view.
/// </summary>
public class ErrorViewModel
{
    /// <summary>Identifier of the request, or null when none is available.</summary>
    public string? RequestId { get; set; }

    /// <summary>True when <see cref="RequestId"/> is neither null nor empty.</summary>
    public bool ShowRequestId
    {
        get { return RequestId is { Length: > 0 }; }
    }
}
9+
}
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,17 @@
1-
<Project Sdk="Microsoft.NET.Sdk">
1+
<Project Sdk="Microsoft.NET.Sdk.Web">
22

33
<PropertyGroup>
4-
<OutputType>Exe</OutputType>
54
<TargetFramework>net8.0</TargetFramework>
6-
<RootNamespace>OCR-with-Tesseract-in-Docker-on-Linux</RootNamespace>
7-
<ImplicitUsings>enable</ImplicitUsings>
85
<Nullable>enable</Nullable>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
<RootNamespace>OCR_with_Tesseract_in_Docker_on_Linux</RootNamespace>
8+
<UserSecretsId>16743565-eaf2-4e18-8eb6-e6ba08388c1f</UserSecretsId>
99
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
1010
</PropertyGroup>
1111

1212
<ItemGroup>
1313
<PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.22.1" />
14-
<PackageReference Include="SkiaSharp.NativeAssets.Linux.NoDependencies" Version="*" />
15-
<PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="*" />
16-
</ItemGroup>
17-
18-
<ItemGroup>
19-
<None Update="Data\Input.pdf">
20-
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
21-
</None>
14+
<PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="31.2.3" />
2215
</ItemGroup>
2316

2417
</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<PropertyGroup>
4+
<ActiveDebugProfile>IIS Express</ActiveDebugProfile>
5+
</PropertyGroup>
6+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
7+
<DebuggerFlavor>ProjectDebugger</DebuggerFlavor>
8+
</PropertyGroup>
9+
</Project>

OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.user

Lines changed: 0 additions & 6 deletions
This file was deleted.

OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Output/.gitkeep

Whitespace-only changes.

0 commit comments

Comments
 (0)