Intro
This time, I will try reading a word-processing file (an MS Word file) with the Open XML SDK.
I will use an MS Word template to check the result.
Environments
- .NET ver.9.0.100
- DocumentFormat.OpenXml ver.3.1.1
- NLog.Web.AspNetCore ver.5.3.14
- Microsoft.AspNetCore.SpaServices.Extensions ver.9.0.0
Reading
I can get a stream from the IFormFile that is sent from the client side and read it as a "WordprocessingDocument".
DocFileReader.cs
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

namespace OfficeFileAccessor.OfficeFiles.Readers;

/// <summary>
/// Reads an uploaded word-processing file and logs the direct child elements of its body.
/// </summary>
public class DocFileReader: IOfficeFileReader
{
    private readonly NLog.Logger logger;

    /// <summary>
    /// Creates the reader and resolves the NLog logger for this class.
    /// </summary>
    public DocFileReader()
    {
        this.logger = NLog.LogManager.GetCurrentClassLogger();
    }

    /// <summary>
    /// Opens <paramref name="file"/> read-only as a <see cref="WordprocessingDocument"/> and
    /// logs the runtime type and inner XML of every direct child element of the document body.
    /// Logs a warning and returns when the document has no main part, document or body.
    /// </summary>
    /// <param name="file">The uploaded file whose content is read.</param>
    public void Read(IFormFile file)
    {
        // Hold the upload stream in its own using declaration so it is released
        // deterministically instead of relying on the document to close it.
        // Using declarations dispose in reverse order: document first, then stream.
        using var stream = file.OpenReadStream();
        using WordprocessingDocument wordDoc = WordprocessingDocument.Open(stream, false);

        Body? body = wordDoc.MainDocumentPart?.Document?.Body;
        if (body == null)
        {
            logger.Warn("Failed reading the document");
            return;
        }

        // Get all elements as XML from body
        foreach (OpenXmlElement elm in body.Elements())
        {
            logger.Info($"Type: {elm.GetType()} XML: {elm.InnerXml}");
        }
    }
}
Result
Type: DocumentFormat.OpenXml.Wordprocessing.Paragraph XML: <w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:pStyle w:val="a6" /><w:jc w:val="left" /></w:pPr><w:r xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:rPr><w:noProof /></w:rPr><mc:AlternateContent xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"><mc:Choice Requires="wps"><w:drawing><wp:anchor distT="0" distB="0" distL="114300" distR="114300" simplePos="0" relativeHeight="251659264" behindDoc="0" locked="0" layoutInCell="1" allowOverlap="1" wp14:editId="2621265D" wp14:anchorId="3EB8B806" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"><wp:simplePos x="0" y="0" /><wp:positionH relativeFrom="column"><wp:posOffset>485140</wp:posOffset></wp:positionH><wp:positionV relativeFrom="paragraph"><wp:posOffset>329565</wp:posOffset></wp:positionV><wp:extent cx="1495425" cy="604837" /><wp:effectExtent l="0" t="0" r="28575" b="24130" /><wp:wrapNone /> ...
I can also get the inner text and style information for each element type.
Table
DocFileReader.cs
...
    public void Read(IFormFile file)
    {
        ...
        // Only tables are inspected here; any other body element is skipped.
        foreach (OpenXmlElement elm in body.Elements())
        {
            if (elm is not Table table)
            {
                continue;
            }

            logger.Info("Table found:");
            ReadTableProps(table);
        }
    }

    /// <summary>
    /// Logs the table-level properties of <paramref name="table"/>, then the height of each row
    /// and the text, width, borders and shading of each cell.
    /// </summary>
    /// <param name="table">The table element to inspect.</param>
    private void ReadTableProps(Table table)
    {
        if (table.GetFirstChild<TableProperties>() is { } tblProps)
        {
            LogTableLevelProps(tblProps);
        }

        foreach (TableRow tblRow in table.Elements<TableRow>())
        {
            // Row height is only reported when the row carries a properties element.
            if (tblRow.GetFirstChild<TableRowProperties>() is { } rowProps)
            {
                logger.Info($"Row Height: {rowProps.GetFirstChild<TableRowHeight>()?.Val}");
            }

            foreach (TableCell tblCell in tblRow.Elements<TableCell>())
            {
                LogCellProps(tblCell);
            }

            // Separator written after each row.
            logger.Info("-----------");
        }

        logger.Info("\n");
    }

    /// <summary>
    /// Logs the table width and, when a borders element exists, the left and top borders.
    /// </summary>
    private void LogTableLevelProps(TableProperties tblProps)
    {
        logger.Info($"Table Width: {tblProps.GetFirstChild<TableWidth>()?.Width}");

        if (tblProps.GetFirstChild<TableBorders>() is not { } tblBorders)
        {
            return;
        }

        var left = tblBorders.LeftBorder;
        logger.Info($"Table Border Left Val: {left?.Val} Color: {left?.Color} Size: {left?.Size}");

        var top = tblBorders.TopBorder;
        logger.Info($"Table Border Top Val: {top?.Val} Color: {top?.Color} Size: {top?.Size}");
    }

    /// <summary>
    /// Logs the cell text and, when the cell has a properties element, its width,
    /// right and bottom borders, and shading colors.
    /// </summary>
    private void LogCellProps(TableCell tblCell)
    {
        logger.Info($"CELL Text: {tblCell.InnerText}");

        if (tblCell.GetFirstChild<TableCellProperties>() is not { } cellProps)
        {
            return;
        }

        logger.Info($"Cell Width: {cellProps.GetFirstChild<TableCellWidth>()?.Width}");

        if (cellProps.GetFirstChild<TableCellBorders>() is { } cellBorders)
        {
            var right = cellBorders.RightBorder;
            logger.Info($"Cell Border Right Val: {right?.Val} Color: {right?.Color} Size: {right?.Size}");

            var bottom = cellBorders.BottomBorder;
            logger.Info($"Cell Border Bottom Val: {bottom?.Val} Color: {bottom?.Color} Size: {bottom?.Size}");
        }

        if (cellProps.GetFirstChild<Shading>() is { } cellShading)
        {
            logger.Info($"Cell BackgroundColor: {cellShading.Fill?.Value} Color:{cellShading.Color}");
        }
    }
}
Top comments (0)