GetTablePages.cs
//
// This code is part of Document Solutions for PDF demos.
// Copyright (c) MESCIUS inc. All rights reserved.
//
using System;
using System.IO;
using System.Drawing;
using System.Linq;
using System.Collections.Generic;
using GrapeCity.Documents.Pdf;
using GrapeCity.Documents.Pdf.Recognition;
using GrapeCity.Documents.Text;
using GrapeCity.Documents.Common;
using GCTEXT = GrapeCity.Documents.Text;
using GCDRAW = GrapeCity.Documents.Drawing;

namespace DsPdfWeb.Demos
{
    /// <summary>
    /// Extracts data from a table that is split between several pages of a PDF,
    /// prints the extracted rows and cells into a new PDF, and appends the
    /// source PDF to the result for reference.
    /// </summary>
    public class GetTablePages
    {
        /// <summary>
        /// Builds the demo PDF and saves it to <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">The stream the generated PDF is saved to.</param>
        /// <returns>The number of pages in the generated document.</returns>
        public int CreatePDF(Stream stream)
        {
            // Multiplier used below to express the table bounds
            // (presumably 72 graphics units per inch - confirm against the library docs).
            const float DPI = 72;
            // Page margin used for the note and for the text layout.
            const float margin = 36;

            var doc = new GcPdfDocument();

            // Text formats: regular cell text, header text, and a red format for problems.
            var tf = new TextFormat()
            {
                Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "NotoSans-Regular.ttf")),
                FontSize = 9,
                ForeColor = Color.Black
            };
            var tfHdr = new TextFormat(tf)
            {
                Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "NotoSans-Bold.ttf")),
                FontSize = 11,
                ForeColor = Color.DarkBlue
            };
            var tfRed = new TextFormat(tf) { ForeColor = Color.Red };

            // The source stream stays open until the document is saved, because the
            // loaded source document is merged into the result just before saving.
            using (var fs = File.OpenRead(Path.Combine("Resources", "PDFs", "product-list.pdf")))
            {
                var page = doc.NewPage();
                page.Landscape = true;
                var g = page.Graphics;

                var rc = Common.Util.AddNote(
                    "This sample loads a PDF that contains a table split between several pages (a product price list), " +
                    "and extracts the tables on all pages using the Page.GetTable() method. " +
                    "The extracted data is printed as a list of rows and cells. " +
                    "The source product list PDF is appended to the generated document for reference.",
                    page,
                    new RectangleF(margin, margin, page.Bounds.Width - margin * 2, page.Bounds.Height - margin * 2));

                // The text layout that accumulates the extracted data; on the first
                // page it starts below the note.
                var tl = g.CreateTextLayout();
                tl.MaxWidth = page.Bounds.Width;
                tl.MaxHeight = page.Bounds.Height;
                tl.MarginAll = margin;
                tl.MarginTop = rc.Bottom;
                tl.DefaultTabStops = 165;

                var docSrc = new GcPdfDocument();
                docSrc.Load(fs);

                for (int i = 0; i < docSrc.Pages.Count; ++i)
                {
                    // TableExtractOptions allow you to fine-tune table recognition accounting for
                    // specifics of the table formatting:
                    var teo = CreateExtractOptions();

                    // The first page uses a larger top offset than the following pages
                    // (presumably to skip a heading above the table - verify against the source PDF).
                    var top = i == 0 ? DPI * 2 : DPI;

                    // Get the table at the specified bounds:
                    var itable = docSrc.Pages[i].GetTable(new RectangleF(DPI * 0.25f, top, DPI * 8, DPI * 10.5f - top), teo);

                    // Guard against pages where no table could be recognized in the bounds,
                    // so that the sample reports it instead of failing on a null reference:
                    if (itable == null)
                    {
                        tl.Append($"\nNo table was found on page {i + 1} of the source document.", tfRed);
                        tl.AppendParagraphBreak();
                        continue;
                    }

                    // Add table data to the text layout:
                    tl.Append($"\nTable on page {i + 1} of the source document has {itable.Cols.Count} column(s) and {itable.Rows.Count} row(s), table data is:", tfHdr);
                    tl.AppendParagraphBreak();
                    for (int row = 0; row < itable.Rows.Count; ++row)
                    {
                        // The first row of each table is printed using the header format:
                        var tfmt = row == 0 ? tfHdr : tf;
                        for (int col = 0; col < itable.Cols.Count; ++col)
                        {
                            var cell = itable.GetCell(row, col);
                            // Cells within a row are separated by tabs:
                            if (col > 0)
                            {
                                tl.Append("\t", tfmt);
                            }
                            if (cell == null)
                            {
                                tl.Append("<no cell>", tfRed);
                            }
                            else
                            {
                                tl.Append(cell.Text, tfmt);
                            }
                        }
                        tl.AppendLine();
                    }
                }

                // Print the extracted data, adding new landscape pages while the
                // layout keeps splitting:
                var to = new TextSplitOptions(tl)
                {
                    RestMarginTop = margin,
                    MinLinesInFirstParagraph = 2,
                    MinLinesInLastParagraph = 2
                };
                tl.PerformLayout(true);
                while (true)
                {
                    var splitResult = tl.Split(to, out TextLayout rest);
                    doc.Pages.Last.Graphics.DrawTextLayout(tl, PointF.Empty);
                    if (splitResult != SplitResult.Split)
                    {
                        break;
                    }
                    tl = rest;
                    doc.NewPage().Landscape = true;
                }

                // Append the original document for reference:
                doc.MergeWithDocument(docSrc);

                doc.Save(stream);
                return doc.Pages.Count;
            }
        }

        // Creates table extraction options with a slightly increased minimum distance between rows.
        private static TableExtractOptions CreateExtractOptions()
        {
            var teo = new TableExtractOptions();
            var defaultMinimumDistanceBetweenRows = teo.GetMinimumDistanceBetweenRows;
            // In this particular case, we slightly increase the minimum distance between rows
            // to make sure cells with wrapped text are not mistaken for two cells:
            teo.GetMinimumDistanceBetweenRows = (list) =>
            {
                var res = defaultMinimumDistanceBetweenRows(list);
                return res * 1.2f;
            };
            return teo;
        }
    }
}