Skip to content

Commit 28bd57f

Browse files
committed
update PdfPig
add RemoveDuplicateLetters
1 parent 564d716 commit 28bd57f

File tree

6 files changed: 84 additions (+84) and 22 deletions (-22).

DocumentLayoutAnalysis/DlaViewer/DlaViewer.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
<ItemGroup>
1010
<PackageReference Include="OxyPlot.Wpf" Version="2.0.0" />
11-
<PackageReference Include="PdfPig" Version="0.1.2-alpha003" />
11+
<PackageReference Include="PdfPig" Version="0.1.2" />
1212
</ItemGroup>
1313

1414
<ItemGroup>

DocumentLayoutAnalysis/DlaViewer/MainViewModel.cs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,48 @@ public bool ClipPaths
223223
}
224224
}
225225

226+
private bool _removeDuplicateLetters;
227+
public bool RemoveDuplicateLetters
228+
{
229+
get
230+
{
231+
return _removeDuplicateLetters;
232+
}
233+
234+
set
235+
{
236+
if (value == _removeDuplicateLetters) return;
237+
_removeDuplicateLetters = value;
238+
239+
if (_pdfPageModel != null)
240+
{
241+
_pdfPageModel.SetRemoveDuplicateLetters(_removeDuplicateLetters);
242+
}
243+
244+
if (IsDisplayLetters)
245+
{
246+
DisplayLetters();
247+
}
248+
249+
if (IsDisplayWords)
250+
{
251+
DisplayWords();
252+
}
253+
254+
if (IsDisplayTextLines)
255+
{
256+
DisplayTextLines();
257+
}
258+
259+
if (IsDisplayTextBlocks)
260+
{
261+
DisplayTextBlocks();
262+
}
263+
264+
this.RaisePropertyChanged(nameof(RemoveDuplicateLetters));
265+
}
266+
}
267+
226268
bool _isDisplayLetters;
227269
public bool IsDisplayLetters
228270
{
@@ -423,6 +465,9 @@ private bool LoadPage(int pageNo)
423465

424466
if (_pdfPageModel == null) return false;
425467

468+
// set remove duplicate letters
469+
_pdfPageModel.SetRemoveDuplicateLetters(_removeDuplicateLetters);
470+
426471
// set word extractor
427472
_pdfPageModel.SetWordExtractor(WordExtractor);
428473

DocumentLayoutAnalysis/DlaViewer/MainWindow.xaml

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
<Grid>
1111
<Grid.ColumnDefinitions>
1212
<ColumnDefinition Width="600*" />
13-
<ColumnDefinition Width="257*" />
13+
<ColumnDefinition Width="255" />
1414
</Grid.ColumnDefinitions>
1515

1616
<Grid.RowDefinitions>
@@ -40,23 +40,30 @@
4040
<Button ToolTip="Next page" Content=">" HorizontalAlignment="Left" Margin="139,8,0,0" VerticalAlignment="Top" Width="30" Click="buttonNext_Click" Height="20"/>
4141
<GroupBox Grid.Column="1" Header="Segmentation" Grid.Row="2">
4242
<Grid>
43+
<Grid.ColumnDefinitions>
44+
<ColumnDefinition Width="202*"/>
45+
<ColumnDefinition Width="27"/>
46+
</Grid.ColumnDefinitions>
4347
<Grid.RowDefinitions>
44-
<RowDefinition Height="95*"/>
45-
<RowDefinition Height="65*"/>
46-
<RowDefinition Height="190"/>
47-
<RowDefinition/>
48+
<RowDefinition Height="65"/>
49+
<RowDefinition Height="55"/>
50+
<RowDefinition Height="225*"/>
4851
</Grid.RowDefinitions>
4952

50-
<ComboBox SelectedItem="{Binding WordExtractor}" SelectedIndex="0" ItemsSource="{Binding WordExtractorList}" Margin="5,10,10,0" VerticalAlignment="Top" Height="22"/>
51-
<ComboBox SelectedItem="{Binding PageSegmenter}" SelectedIndex="0" ItemsSource="{Binding PageSegmenterList}" Margin="5,0,10,0" VerticalAlignment="Center" Height="22"/>
53+
<ComboBox SelectedItem="{Binding WordExtractor}" SelectedIndex="0" ItemsSource="{Binding WordExtractorList}" Margin="5,10,0,0" VerticalAlignment="Top" Height="22"/>
54+
<Button x:Name="WordExtractorOptionButton" Content="o" Margin="5,10,5,0" VerticalAlignment="Top" Grid.Column="1" Height="22"/>
55+
56+
<ComboBox SelectedItem="{Binding PageSegmenter}" SelectedIndex="0" ItemsSource="{Binding PageSegmenterList}" Margin="5,37,0,0" Height="22" VerticalAlignment="Top"/>
57+
<Button Content="o" Margin="5,37,5,0" VerticalAlignment="Top" Grid.Column="1" Height="22"/>
5258

53-
<GroupBox Header="Options" Margin="0,0,0,0" Grid.Row="1">
59+
<GroupBox Header="Options" Grid.ColumnSpan="2" Grid.Row="1" Margin="0,0,0,1">
5460
<Grid>
5561
<CheckBox IsChecked="{Binding ClipPaths}" Content="Clip paths" HorizontalAlignment="Left" Margin="5,10,0,0" VerticalAlignment="Top" ToolTip="Will reload the document"/>
62+
<CheckBox IsChecked="{Binding RemoveDuplicateLetters}" Content="Remove Dupl. Letters" HorizontalAlignment="Left" Margin="83,10,0,0" VerticalAlignment="Top" Width="138" ToolTip="Remove Duplicate Overlapping Letters"/>
5663
</Grid>
5764
</GroupBox>
5865

59-
<GroupBox Header="Display" Grid.Row="2" Margin="0,0,0,0">
66+
<GroupBox Header="Display" Grid.Row="2" Grid.ColumnSpan="2">
6067
<Grid>
6168
<Grid.ColumnDefinitions>
6269
<ColumnDefinition Width="175*"/>
@@ -71,10 +78,9 @@
7178
<CheckBox Content="Tables" HorizontalAlignment="Left" Margin="5,130,0,0" VerticalAlignment="Top" />
7279
</Grid>
7380
</GroupBox>
74-
7581
</Grid>
7682
</GroupBox>
7783

78-
<Label Name="DragDropLabel" Content="Drag and Drop pdf document here..." Margin="172,156,161,0" Grid.Row="1" Height="48" VerticalAlignment="Top"/>
84+
<Label Name="DragDropLabel" Content="Drag and Drop pdf document here..." HorizontalAlignment="Center" VerticalAlignment="Center" HorizontalContentAlignment="Center" VerticalContentAlignment="Center" Grid.Row="1" Height="68"/>
7985
</Grid>
8086
</Window>

DocumentLayoutAnalysis/DlaViewer/PdfPageModel.cs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ public class PdfPageModel
1414
private readonly Page page;
1515
private IWordExtractor wordExtractor;
1616
private IPageSegmenter pageSegmenter;
17+
private bool removeDuplicateLetters;
1718

1819
public double Height => page.Height;
1920
public double Width => page.Width;
@@ -24,6 +25,11 @@ internal PdfPageModel(Page page)
2425
this.page = page;
2526
}
2627

28+
public void SetRemoveDuplicateLetters(bool remove)
29+
{
30+
removeDuplicateLetters = remove;
31+
}
32+
2733
public void SetWordExtractor(Type wordExtractor)
2834
{
2935
if (wordExtractor == null) return;
@@ -44,8 +50,13 @@ public void SetPageSegmenter(Type pageSegmenter)
4450
this.pageSegmenter = (IPageSegmenter)Activator.CreateInstance(pageSegmenter);
4551
}
4652

47-
public IEnumerable<Letter> GetLetters()
53+
public IReadOnlyList<Letter> GetLetters()
4854
{
55+
if (removeDuplicateLetters)
56+
{
57+
return DuplicateOverlappingTextProcessor.Get(page.Letters);
58+
}
59+
4960
return page.Letters;
5061
}
5162

@@ -56,7 +67,7 @@ public IEnumerable<Word> GetWords()
5667
return new List<Word>();
5768
}
5869

59-
return page.GetWords(wordExtractor);
70+
return wordExtractor.GetWords(GetLetters());
6071
}
6172

6273
public IEnumerable<TextBlock> GetTextBlocks()

DocumentLayoutAnalysis/DocumentLayoutAnalysis/DocumentLayoutAnalysis.csproj

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,25 +53,25 @@
5353
<Reference Include="System.Net.Http" />
5454
<Reference Include="System.Xml" />
5555
<Reference Include="UglyToad.PdfPig, Version=0.1.2.0, Culture=neutral, PublicKeyToken=605d367334e74123, processorArchitecture=MSIL">
56-
<HintPath>..\packages\PdfPig.0.1.2-alpha003\lib\net461\UglyToad.PdfPig.dll</HintPath>
56+
<HintPath>..\packages\PdfPig.0.1.2\lib\net461\UglyToad.PdfPig.dll</HintPath>
5757
</Reference>
5858
<Reference Include="UglyToad.PdfPig.Core, Version=0.1.2.0, Culture=neutral, PublicKeyToken=605d367334e74123, processorArchitecture=MSIL">
59-
<HintPath>..\packages\PdfPig.0.1.2-alpha003\lib\net461\UglyToad.PdfPig.Core.dll</HintPath>
59+
<HintPath>..\packages\PdfPig.0.1.2\lib\net461\UglyToad.PdfPig.Core.dll</HintPath>
6060
</Reference>
6161
<Reference Include="UglyToad.PdfPig.DocumentLayoutAnalysis, Version=0.1.2.0, Culture=neutral, PublicKeyToken=605d367334e74123, processorArchitecture=MSIL">
62-
<HintPath>..\packages\PdfPig.0.1.2-alpha003\lib\net461\UglyToad.PdfPig.DocumentLayoutAnalysis.dll</HintPath>
62+
<HintPath>..\packages\PdfPig.0.1.2\lib\net461\UglyToad.PdfPig.DocumentLayoutAnalysis.dll</HintPath>
6363
</Reference>
6464
<Reference Include="UglyToad.PdfPig.Fonts, Version=0.1.2.0, Culture=neutral, PublicKeyToken=605d367334e74123, processorArchitecture=MSIL">
65-
<HintPath>..\packages\PdfPig.0.1.2-alpha003\lib\net461\UglyToad.PdfPig.Fonts.dll</HintPath>
65+
<HintPath>..\packages\PdfPig.0.1.2\lib\net461\UglyToad.PdfPig.Fonts.dll</HintPath>
6666
</Reference>
6767
<Reference Include="UglyToad.PdfPig.Package, Version=0.1.1.0, Culture=neutral, processorArchitecture=MSIL">
68-
<HintPath>..\packages\PdfPig.0.1.2-alpha003\lib\net461\UglyToad.PdfPig.Package.dll</HintPath>
68+
<HintPath>..\packages\PdfPig.0.1.2\lib\net461\UglyToad.PdfPig.Package.dll</HintPath>
6969
</Reference>
7070
<Reference Include="UglyToad.PdfPig.Tokenization, Version=0.1.2.0, Culture=neutral, PublicKeyToken=605d367334e74123, processorArchitecture=MSIL">
71-
<HintPath>..\packages\PdfPig.0.1.2-alpha003\lib\net461\UglyToad.PdfPig.Tokenization.dll</HintPath>
71+
<HintPath>..\packages\PdfPig.0.1.2\lib\net461\UglyToad.PdfPig.Tokenization.dll</HintPath>
7272
</Reference>
7373
<Reference Include="UglyToad.PdfPig.Tokens, Version=0.1.2.0, Culture=neutral, PublicKeyToken=605d367334e74123, processorArchitecture=MSIL">
74-
<HintPath>..\packages\PdfPig.0.1.2-alpha003\lib\net461\UglyToad.PdfPig.Tokens.dll</HintPath>
74+
<HintPath>..\packages\PdfPig.0.1.2\lib\net461\UglyToad.PdfPig.Tokens.dll</HintPath>
7575
</Reference>
7676
</ItemGroup>
7777
<ItemGroup>
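
packages.config (file path missing from this capture; presumably DocumentLayoutAnalysis/DocumentLayoutAnalysis/packages.config)

Lines changed: 1 addition & 1 deletion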
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<?xml version="1.0" encoding="utf-8"?>
22
<packages>
3-
<package id="PdfPig" version="0.1.2-alpha003" targetFramework="net461" />
3+
<package id="PdfPig" version="0.1.2" targetFramework="net461" />
44
<package id="System.ValueTuple" version="4.5.0" targetFramework="net461" />
55
</packages>

0 commit comments

Comments
 (0)