DEV Community

Masui Masanori
Masui Masanori

Posted on

[ASP.NET Core] Try reading a spreadsheet file by OpenXML 1

Intro

This time, I will try reading a spreadsheet file(MS Excel file) by DocumentFormat.OpenXml.
I will use this file to check the result.

Image description

Getting cells

To load a spreadsheet using OpenXML, I will get a WorkbookPart from the spreadsheet file first.
And then I will get a WorksheetPart from it, finally I will be able to get Worksheets from the WorksheetPart.

XlsFileReader.cs

using DocumentFormat.OpenXml; using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Spreadsheet; using DocumentFormat.OpenXml.Drawing.Spreadsheet; using Drawing = DocumentFormat.OpenXml.Drawing; using OfficeFileAccessor.Apps; using System.Text.RegularExpressions; namespace OfficeFileAccessor.OfficeFiles.Readers; public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader { ... public void Read(IFormFile file) { using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false); WorkbookPart? bookPart = spreadsheet.WorkbookPart; if(bookPart == null) { return; } List<string> sheetNames = GetSheetNameList(bookPart); foreach(var name in sheetNames) { Worksheet? targetSheet = GetWorksheet(bookPart, name); if(targetSheet == null) { return; } WorksheetPart? sheetPart = targetSheet.WorksheetPart; foreach(Row row in targetSheet.Descendants<Row>()) { foreach(Cell cell in row.Cast<Cell>()) { // Get cell values Worksheets.Cell? cellValue = GetCellValue(bookPart, cell); } } // Load first sheet for testing. break; } } ... private static List<string> GetSheetNameList(WorkbookPart bookPart) => [.. bookPart.Workbook.Descendants<Sheet>().Where(s => string.IsNullOrEmpty(s.Name) == false).Select(s => s.Name?.Value ?? "")]; private static Worksheet? GetWorksheet(WorkbookPart bookPart, string sheetName) { foreach(Sheet s in bookPart.Workbook.Descendants<Sheet>()) { if(s.Name == sheetName && string.IsNullOrEmpty(s.Id) == false) { if(bookPart.TryGetPartById(s.Id!, out var part)) { if (part is WorksheetPart result) { return result.Worksheet; } } } } return null; } ... } 
Enter fullscreen mode Exit fullscreen mode

Getting cell values

After getting cell instances, I can get values from them.

Text

Cell.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets; public class Cell { public required string Address { get; init; } public required CellValueType Type { get; init; } public required string Value { get; init; } public string? Formula { get; init; } ... } 
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

... private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell) { // Get value string value = cell.InnerText; // if the data type is SharedString, find the value from Shared String Table if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString) { SharedStringTablePart? sharedStringTablePart = bookPart.GetPartsOfType<SharedStringTablePart>() ?.FirstOrDefault(); if (sharedStringTablePart != null) { OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable .ElementAt(int.Parse(value)); // Concatenate all text except phonetic reading string result = string.Concat( sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>() .Where(t => CheckIsPhonetic(t) == false) .Select(t => t.Text) ); return new Worksheets.Cell { Address = cell.CellReference?.Value ?? "A1", Type = Worksheets.CellValueType.Text, Value = result, }; } } Worksheets.CellValueType valueType = Worksheets.CellValueType.Text; return new Worksheets.Cell { Address = cell.CellReference?.Value ?? "A1", Type = valueType, Value = value, }; } /// <summary> /// Check if the parent element is "PhoneticRun" /// </summary> /// <param name="textElement"></param> /// <returns></returns> private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement) { return textElement.Ancestors<PhoneticRun>().Any(); } ... 
Enter fullscreen mode Exit fullscreen mode

SharedStringTablePart

When I get the cell value from the SharedStringTablePart, if I get it from InnerText, the phonetic will also be combined.

example
  • Cell value: 建築物に関する事項(集約版)(参考様式)
  • InnerText: 建築物に関する事項(集約版)(参考様式)ケンチクブツシュウヤクバンサンコウヨウシキ

So I should filter the values.

XlsFileReader.cs

... private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell) { ... OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable .ElementAt(int.Parse(value)); // Concatenate all text except phonetic reading string result = string.Concat( sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>() .Where(t => CheckIsPhonetic(t) == false) .Select(t => t.Text) ); ... } ... private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement) { return textElement.Ancestors<PhoneticRun>().Any(); } ... 
Enter fullscreen mode Exit fullscreen mode

Double

I can get numeric values as same as getting texts.
But some values may not be correct if they have decimal points.

XlsFileReader.cs

... Worksheets.CellValueType valueType = Worksheets.CellValueType.Text; if (string.IsNullOrEmpty(value) == false && double.TryParse(value, out double nv)) { valueType = Worksheets.CellValueType.Double; value = nv.ToString("G"); } return new Worksheets.Cell { Address = cell.CellReference?.Value ?? "A1", Type = valueType, Value = value, }; ... 
Enter fullscreen mode Exit fullscreen mode

Formula

XlsFileReader.cs

... // Formula string? formula = cell.CellFormula?.Text; // Get calculation result string? calcResult = cell.CellValue?.InnerText; if(string.IsNullOrEmpty(formula) == false && string.IsNullOrEmpty(calcResult) == false) { if (double.TryParse(calcResult, out double n)) { calcResult = n.ToString("G"); } return new Worksheets.Cell { Address = cell.CellReference?.Value ?? "A1", Type = Worksheets.CellValueType.Formula, Value = calcResult, Formula = formula, }; } ... 
Enter fullscreen mode Exit fullscreen mode

Getting cell styles

Border

Get cell borders from WorkbookStylesPart.

Cell.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets; public class Cell { public required string Address { get; init; } public required CellValueType Type { get; init; } public required string Value { get; init; } public string? Formula { get; init; } public required double Width { get; init; } public required double Height { get; init; } public string? BackgroundColor { get; init; } public required CellBorders Borders { get; init; } ... } 
Enter fullscreen mode Exit fullscreen mode

CellBorders.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets; public record CellBorders { public BorderType Left { get; init; } public BorderType Top { get; init; } public BorderType Right { get; init; } public BorderType Bottom { get; init; } ... public static CellBorders GetNoBorders() { return new () { Left = BorderType.None, Top = BorderType.None, Right = BorderType.None, Bottom = BorderType.None, }; } } 
Enter fullscreen mode Exit fullscreen mode

BorderType.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets; public enum BorderType { None = 0, Thin, } public static class BorderTypeFactory { public static BorderType Get(string? borderValue) { if(string.IsNullOrEmpty(borderValue)) { return BorderType.None; } return borderValue.ToLower() switch { "thin" => BorderType.Thin, _ => BorderType.None, }; } } 
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

... private Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height) { // Borders Worksheets.CellBorders borders = GetBorders(bookPart, cell); ... return new Worksheets.Cell { Address = cell.CellReference?.Value ?? "A1", Type = valueType, Value = value, Borders = borders, }; } ... private static Worksheets.CellBorders GetBorders(WorkbookPart bookPart, Cell cell) { if(cell.StyleIndex?.Value == null) { return Worksheets.CellBorders.GetNoBorders(); } CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)cell.StyleIndex.Value) as CellFormat; if(cellFormat?.BorderId?.Value != null) { Border? border = bookPart.WorkbookStylesPart?.Stylesheet?.Borders?.ElementAt( (int)cellFormat.BorderId.Value) as Border; if(border != null) { return new () { Left = Worksheets.BorderTypeFactory.Get(border?.LeftBorder?.Style?.InnerText), Top = Worksheets.BorderTypeFactory.Get(border?.TopBorder?.Style?.InnerText), Right = Worksheets.BorderTypeFactory.Get(border?.RightBorder?.Style?.InnerText), Bottom = Worksheets.BorderTypeFactory.Get(border?.BottomBorder?.Style?.InnerText), }; } } return Worksheets.CellBorders.GetNoBorders(); } ... 
Enter fullscreen mode Exit fullscreen mode

Background color

XlsFileReader.cs

... private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell) { // Cell color string? backgroundColor = GetCellColor(cell, bookPart); ... return new Worksheets.Cell { Address = cell.CellReference?.Value ?? "A1", Type = valueType, Value = value, BackgroundColor = backgroundColor, Borders = borders, }; } ... private static string? GetCellColor(Cell cell, WorkbookPart bookPart) { uint? styleIndex = cell.StyleIndex?.Value; if(styleIndex == null) { return null; } CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)styleIndex) as CellFormat; if (cellFormat?.FillId != null) { Fill? fill = bookPart.WorkbookStylesPart?.Stylesheet?.Fills?.ElementAt((int)cellFormat.FillId.Value) as Fill; PatternFill? patternFill = fill?.PatternFill; string? rgbColor = GetRgbColor(patternFill?.ForegroundColor?.Rgb); if(string.IsNullOrEmpty(rgbColor) == false) { return rgbColor; } string? themeColor = GetThemeColor(bookPart, patternFill?.ForegroundColor?.Theme?.Value); if(string.IsNullOrEmpty(themeColor) == false) { return themeColor; } } return null; } private static string? GetRgbColor(HexBinaryValue? rgb) { if(rgb?.InnerText == null) { return null; } // Remove alpha value return rgb.InnerText[2..]; } private static string? GetThemeColor(WorkbookPart bookPart, uint? themeColorIndex) { if(themeColorIndex == null || themeColorIndex <= 0) { return null; } ThemePart? themePart = bookPart.ThemePart; Drawing.Theme? theme = themePart?.Theme; if(theme != null) { Drawing.Color2Type? color2Type = theme.ThemeElements?.ColorScheme?.ElementAt((int)themeColorIndex) as Drawing.Color2Type; return color2Type?.RgbColorModelHex?.Val; } return null; } ... 
Enter fullscreen mode Exit fullscreen mode

Width, Height

By default, cells don't have own widths and heights.
So I should get them from columns and rows.

PrintArea

To get column widths, I should get PrintArea first to get last column index.

PageArea.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets; public record PageArea(CellAddress Start, CellAddress End); 
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

... private List<Worksheets.PageArea> GetPrintArea(WorkbookPart bookPart, Worksheet worksheet) { DefinedNames? definedNames = bookPart.Workbook.DefinedNames; if(definedNames == null) { return []; } List<Worksheets.PageArea> results = []; foreach (DefinedName definedName in definedNames.Elements<DefinedName>()) { if(string.IsNullOrEmpty(definedName.Name?.Value)) { continue; } if (definedName.Name.Value.StartsWith("_xlnm.Print_Area")) { string sheetName = "default sheet"; if(definedName.LocalSheetId != null) { Sheet? sheet = bookPart.Workbook.Sheets?.Elements<Sheet>() ?.FirstOrDefault(s => s.SheetId?.Value != null && s.SheetId.Value == definedName.LocalSheetId.Value + 1); if(sheet?.Name != null) { sheetName = sheet.Name!; } } string printAreaValue = definedName.Text; // SheetName is like SheetName!$A$1:$Z$20 string[] ranges = printAreaValue.Split('!'); foreach(var r in ranges) { string[] addresses = r.Split(":"); if(addresses.Length < 2) { continue; } (string columnNameStart, int rowStart) = GetCellAddress(addresses[0]); if(string.IsNullOrEmpty(columnNameStart) || rowStart <= 0) { continue; } Worksheets.CellAddress startAddress = new (columnNameStart, ConvertAlphabetToIndex(columnNameStart), rowStart); (string columnNameEnd, int rowEnd) = GetCellAddress(addresses[1]); if(string.IsNullOrEmpty(columnNameEnd) || rowEnd <= 0) { continue; } Worksheets.CellAddress endAddress = new (columnNameEnd, ConvertAlphabetToIndex(columnNameEnd), rowEnd); results.Add(new (startAddress, endAddress)); } } } return results; } ... 
Enter fullscreen mode Exit fullscreen mode

Width, Height

XlsFileReader.cs

... public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader { private readonly double DefaultWidth = Numbers.ConvertFromPixelToCentimeter(8.38 * 7.0); private readonly double DefaultHeight = Numbers.ConvertFromPointToCentimeter(18.75); private static readonly Regex CellAddressRegex = new (@"\$([a-zA-Z]+)\$([0-9]+)"); private static readonly Regex ColumnNameRegex = new ("([a-zA-Z]+)"); public void Read(IFormFile file) { using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false); WorkbookPart? bookPart = spreadsheet.WorkbookPart; List<string> sheetNames = GetSheetNameList(bookPart); foreach(var name in sheetNames) { Worksheet? targetSheet = GetWorksheet(bookPart, name); if(targetSheet == null) { return; } List<Worksheets.PageArea> printAreas = GetPrintArea(bookPart, targetSheet); Worksheets.PageArea? target = printAreas.FirstOrDefault(); List<Worksheets.ColumnWidth> widths = []; if(target != null) { widths = GetColumnWidths(targetSheet, target.Start.Column, target.End.Column); } foreach(Row row in targetSheet.Descendants<Row>()) { double height = DefaultHeight; if(row.Height?.Value != null) { // The row.Height value is expressed in points // convert it to centimeters height = Numbers.ConvertFromPointToCentimeter(row.Height.Value); } foreach(Cell cell in row.Cast<Cell>()) { string columnName = GetColumnNameFromAddress(cell.CellReference); double? width = widths.FirstOrDefault(w => w.ColumnName == columnName)?.Width; width ??= DefaultWidth; Worksheets.Cell? cellValue = GetCellValue(bookPart, cell, (double)width, height); } } // Load first sheet for testing. break; } } private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height) { ... return new Worksheets.Cell { Address = cell.CellReference?.Value ?? "A1", Type = valueType, Value = value, Width = width, Height = height, BackgroundColor = backgroundColor, Borders = borders, }; } ... private List<Worksheets.ColumnWidth> GetColumnWidths(Worksheet sheet, int startColumn, int lastColumn) { Columns? columns = sheet.Descendants<Columns>().FirstOrDefault(); if (columns == null) { return []; } List<Worksheets.ColumnWidth> results = []; for (int i = startColumn; i <= lastColumn; i++) { double columnWidth = DefaultWidth; if (columns != null) { uint idx = (uint)i; Column? column = columns.Elements<Column>().FirstOrDefault(c => c?.Min != null && c.Max != null && c.Min <= idx && c.Max >= idx); if (column?.Width != null) { // The column.width value represents the number of characters (7 pixels by default) // convert it to centimeters. columnWidth = Numbers.ConvertFromPixelToCentimeter(column.Width * 7.0); } } results.Add(new Worksheets.ColumnWidth(i, ConvertIndexToAlphabet(i), columnWidth)); } return results; } ... 
Enter fullscreen mode Exit fullscreen mode

Numbers.cs

namespace OfficeFileAccessor.Apps; public static class Numbers { private const double EmuToCm = 1.0 / 360000.0; private const double PointToCm = 0.0352778; private const double PixelToCm = 0.0264583; public static int ParseInt(string? value, int defaultValue) { if(string.IsNullOrEmpty(value)) { return defaultValue; } if(int.TryParse(value, out var result)) { return result; } return defaultValue; } public static double ConvertFromEMUToCentimeter(int emuValue) { return EmuToCm * (double)emuValue; } public static double ConvertFromPointToCentimeter(double pointValue) { return PointToCm * pointValue; } public static double ConvertFromPixelToCentimeter(double pixelValue) { return PixelToCm * pixelValue; } } 
Enter fullscreen mode Exit fullscreen mode

Top comments (0)