diff options
| author | Cary Coutant <ccoutant@gmail.com> | 2023-03-23 17:14:58 -0700 |
|---|---|---|
| committer | Cary Coutant <ccoutant@gmail.com> | 2023-03-23 17:14:58 -0700 |
| commit | 49409d19ef4c6de858905775b3662e14ee735bcd (patch) | |
| tree | a1830713349b8edd0693bd90efb7af269cf9282e /tools | |
| parent | issues/170427.3 Fix more typos found by Pedro Alves (diff) | |
Initial conversion to markdown format.
I've included a python script in tools/html-to-md.py that I used to do this conversion, in case anyone wants to reconstruct it. It does a rudimentary job, slightly tailored to the structure of the existing DWARF HTML pages. After the conversion, I did some manual tweaks, and rearranged a few pages.
Diffstat (limited to 'tools')
| -rwxr-xr-x | tools/html-to-md.py | 613 |
1 files changed, 613 insertions, 0 deletions
diff --git a/tools/html-to-md.py b/tools/html-to-md.py new file mode 100755 index 0000000..ec6202b --- /dev/null +++ b/tools/html-to-md.py | |||
| @@ -0,0 +1,613 @@ | |||
| 1 | #! /usr/bin/python3 | ||
| 2 | |||
| 3 | # html-to-md | ||
| 4 | # Rudimentary conversion of HTML to Markdown. | ||
| 5 | # -h | ||
| 6 | # Strips the header and footer elements from the page. | ||
| 7 | # Looks for the specific patterns used in the DWARF HTML code. | ||
| 8 | # -p | ||
| 9 | # Tailors the conversion to DWARF issue pages, extracting | ||
| 10 | # metadata from the start of the file, and using the | ||
| 11 | # contents of the <pre> element as the markdown. | ||
| 12 | # The metadata is placed at the top of the generated | ||
| 13 | # markdown file. | ||
| 14 | # -v n | ||
| 15 | # Specifies the target DWARF version number (for issue metadata). | ||
| 16 | # Default: 6. | ||
| 17 | |||
| 18 | import sys | ||
| 19 | import re | ||
| 20 | import getopt | ||
| 21 | from html.parser import HTMLParser | ||
| 22 | from html.entities import name2codepoint | ||
| 23 | |||
| 24 | # These tags are treated as block tags; i.e., they are laid out | ||
| 25 | # as blocks on the page. | ||
| 26 | |||
| 27 | block_tags = [ | ||
| 28 | "__root__", | ||
| 29 | "html", "head", "meta", "title", "body", | ||
| 30 | "h1", "h2", "h3", "h4", "h5", "h6", | ||
| 31 | "div", "p", | ||
| 32 | "table", "thead", "tbody", "tr", "th", "td", "caption", | ||
| 33 | "pre", "ul", "ol", "li", "dl", "dt", "dd", | ||
| 34 | "hr" | ||
| 35 | ] | ||
| 36 | |||
| 37 | # These tags have a well-defined structure, with specific | ||
| 38 | # kinds of tags allowed within. Unstructured text inside | ||
| 39 | # these tags will be ignored. | ||
| 40 | |||
| 41 | structured_tags = [ | ||
| 42 | "table", "thead", "tbody", "tr" | ||
| 43 | ] | ||
| 44 | |||
| 45 | # These tags are allowed to contain other block tags. | ||
| 46 | # Used to help decide whether to auto-close the current | ||
| 47 | # tag or nest the new one inside. | ||
| 48 | |||
| 49 | can_contain_block_tags = [ | ||
| 50 | "html", "head", "body", "div", | ||
| 51 | "table", "thead", "tbody", "tr", "th", "td", | ||
| 52 | "pre", "ul", "ol", "li", "dl", "dd" | ||
| 53 | ] | ||
| 54 | |||
| 55 | # These tags are treated as inline content, and may be | ||
| 56 | # contained inside any other (non-structured) tag. | ||
| 57 | |||
| 58 | inline_tags = [ | ||
| 59 | "i", "b", "em", "strong", "code", "br", "img", "a", "small", | ||
| 60 | "button" | ||
| 61 | ] | ||
| 62 | |||
| 63 | # These tags have optional closing tags, and should be | ||
| 64 | # automatically closed when we see a starting tag that | ||
| 65 | # cannot be contained inside it. | ||
| 66 | |||
| 67 | auto_closing_tags = [ | ||
| 68 | "meta", "body", "p", "li", "dt", "dd", "tr", "td", "th", "a" | ||
| 69 | ] | ||
| 70 | |||
| 71 | # These tags do not require a closing tag. | ||
| 72 | |||
| 73 | unbalanced_tags = [ | ||
| 74 | "br", "hr", "img" | ||
| 75 | ] | ||
| 76 | |||
| 77 | # We ignore the following tags for the purposes of conversion. | ||
| 78 | |||
| 79 | ignored_tags = [ | ||
| 80 | "html", "head", "thead", "tbody", "hr", "font" | ||
| 81 | ] | ||
| 82 | |||
| 83 | # These are the heading tags, in order of precedence. | ||
| 84 | |||
| 85 | heading_tags = [ | ||
| 86 | "h1", "h2", "h3", "h4", "h5", "h6" | ||
| 87 | ] | ||
| 88 | |||
# Return True if a tag1 element can contain a tag2 element.
#
# A handful of special cases are rejected first; everything else is
# allowed when tag1 may hold block content or tag2 is an inline tag.

def can_contain(tag1, tag2):
    definition_parts = ("dt", "dd")
    # Containers that accept only a fixed set of direct children.
    restricted_children = {
        "table": ("tr", "caption"),
        "tr": ("th", "td"),
    }

    # Links and list items never nest directly inside themselves.
    if tag1 == tag2 and tag1 in ("a", "li"):
        return False
    # A term or definition ends at the next term or definition.
    if tag1 in definition_parts and tag2 in definition_parts:
        return False
    permitted = restricted_children.get(tag1)
    if permitted is not None and tag2 not in permitted:
        return False
    # <body> is never nested inside another element.
    if tag2 == "body":
        return False
    return tag1 in can_contain_block_tags or tag2 in inline_tags
| 105 | |||
# Create a substitution reference for ref and return the inline text.
#
# Placeholder: no substitution is generated yet, so the reference is
# handed back unchanged.

def create_sub(ref):
    # TODO: create a unique substitution reference for ref.
    inline_text = ref
    return inline_text
| 111 | |||
# Class Tag represents an HTML element in the intermediate tree.
#   tagname is the HTML tag.
#   attrs is a dictionary of the element's attributes.
#   parent is a pointer to the containing element (None for the root).
#   elements is the list of children in document order: Tag objects for
#       block elements, and strings for text (inline elements are
#       converted to markdown text when they are closed).
#   loc is the (line, offset) position within the HTML source.
#   aux is a list of auxiliary blocks (e.g., substitution definitions) to write
#       at the end of the document.

class Tag:
    # Markdown delimiters wrapped around the text of simple inline tags.
    _INLINE_DELIMITERS = {
        "i": "*",
        "em": "*",
        "b": "**",
        "strong": "**",
        "code": "``",
    }

    def __init__(self, tagname, attrs, parent, loc):
        self.tagname = tagname
        self.attrs = attrs
        self.parent = parent
        self.elements = []
        self.loc = loc
        self.aux = []

    # Return True if we are inside a tagname element.  Only ancestors
    # are examined, not the element itself.
    def is_inside(self, tagname):
        node = self.parent
        while node:
            if node.tagname == tagname:
                return True
            node = node.parent
        return False

    # Append a new child element.  A <body> element is not kept as a
    # node of its own; its children are spliced into this element.
    def append_element(self, elem):
        if elem.tagname == "body":
            self.elements += elem.elements
        else:
            self.elements.append(elem)

    # Append a new auxiliary block.
    def append_aux(self, elem):
        self.aux.append(elem)

    # Append a new text node.  Text inside a structured tag is dropped;
    # an error is reported if the dropped text is not just whitespace.
    def append_data(self, data):
        if self.tagname in structured_tags:
            if data.strip():
                sys.stderr.write("Error: found unstructured data inside %s element at line %d\n" % (self.tagname, self.loc[0]))
            return
        if data:
            self.elements.append(data)

    # Close the current element.  If a block tag, append it as a block
    # to the parent; if an inline tag, convert it to inline text and
    # append the text to the parent.  Auxiliary blocks are handed up to
    # the parent.  Return the parent element (or self, with an error
    # message, when called on the root).
    def close(self):
        if not self.parent:
            sys.stderr.write("Error: attempted to close root tag\n")
            return self
        if self.tagname in block_tags:
            self.parent.append_element(self)
        else:
            self.parent.append_data(self.to_md_inline())
        for item in self.aux:
            self.parent.append_aux(item)
        return self.parent

    # Write a debug representation of the current subtree to stderr.
    # level is the prefix written in front of each line.
    def debug(self, level):
        sys.stderr.write(level + self.tagname + "\n")
        for e in self.elements:
            if isinstance(e, Tag):
                e.debug(level + "| ")
            else:
                sys.stderr.write(level + "| " + repr(e) + "\n")

    # Convert the current tag to inline markdown.
    def to_md_inline(self):
        tagname = self.tagname

        if tagname == "br":
            # Ignore <br> tags for now.
            return ""

        if tagname == "img":
            return self.img_to_md()

        if tagname == "code" and self.is_inside("pre"):
            # Inside <pre> the text is already literal; add no markup.
            return self.collect_inline_elements()

        if tagname in self._INLINE_DELIMITERS:
            delim = self._INLINE_DELIMITERS[tagname]
            text = self.collect_inline_elements().strip()
            # Empty emphasis/code spans produce no output at all.
            return delim + text + delim if text else ""

        if tagname == "a":
            href = self.attrs["href"] if "href" in self.attrs else "#"
            text = self.collect_inline_elements().strip()
            return "[" + text + "](%s)" % href

        return self.collect_inline_elements()

    # Convert all child elements to inline markdown and collect them into
    # one chunk of text.
    def collect_inline_elements(self):
        return "".join(
            elem.to_md_inline() if isinstance(elem, Tag) else elem
            for elem in self.elements
        )

    # Convert a block element to markdown.  For the root element, the
    # auxiliary blocks are appended after the converted content.
    def to_md(self):
        if self.tagname == "title":
            text = ""
        elif self.tagname == "table":
            text = self.table_to_md()
        elif self.tagname == "pre":
            text = indent(self.pre_to_md())
        elif self.tagname in heading_tags:
            text = self.heading_to_md(heading_tags.index(self.tagname))
        elif self.tagname in ["ul", "ol"]:
            text = self.list_to_md()
        elif self.tagname == "dl":
            text = self.dl_to_md()
        else:
            text = self.block_tag_to_md()

        if not self.parent and self.aux:
            text += "\n\n" + "\n\n".join(self.aux)
        return text

    # Return markdown for a heading element.  level is the zero-based
    # index into heading_tags, so level 0 produces a single "#".
    def heading_to_md(self, level):
        blocks = [elem.to_md() if isinstance(elem, Tag) else elem for elem in self.elements]
        text = "".join(blocks)
        # Headings must fit on one line.
        return "#" * (level + 1) + " " + text.replace("\n", " ")

    # Return markdown for a <pre> element: the concatenated text of all
    # descendants, with leading and trailing newlines removed.
    def pre_to_md(self):
        blocks = [elem.pre_to_md() if isinstance(elem, Tag) else elem for elem in self.elements]
        return "".join(blocks).strip("\n")

    # Return markdown for a <ul> or <ol> element.  Only <li> children
    # are rendered; anything else directly inside the list is skipped.
    def list_to_md(self):
        marker = "* " if self.tagname == "ul" else "1. "
        items = [
            hanging_indent(elem.to_md().strip(), marker)
            for elem in self.elements
            if isinstance(elem, Tag) and elem.tagname == "li"
        ]
        return "\n\n".join(items)

    # Build one definition-list entry from a term, the definition blocks
    # collected so far, and any trailing loose text.  Return None when
    # there is nothing to emit.
    @staticmethod
    def _dl_entry(term, definitions, trailing_text):
        trailing_text = trailing_text.strip()
        if not (term or definitions or trailing_text):
            return None
        blocks = [indent(block) for block in definitions + form_paras(trailing_text)]
        return term + "\n" + "\n\n".join(blocks)

    # Return markdown for a <dl> element.  Each <dt> starts a new entry;
    # everything up to the next <dt> is rendered as indented blocks
    # below the term.
    def dl_to_md(self):
        dl_items = []
        dt_item = ""
        dd_items = []
        text = ""
        for elem in self.elements:
            if isinstance(elem, Tag) and elem.tagname == "dt":
                # A new term finishes the entry that was in progress.
                entry = self._dl_entry(dt_item, dd_items, text)
                if entry is not None:
                    dl_items.append(entry)
                dt_item = elem.to_md()
                dd_items = []
                text = ""
            elif isinstance(elem, Tag):
                # Loose text seen before this block becomes paragraphs.
                dd_items += form_paras(text)
                text = ""
                block = elem.to_md()
                if block:
                    dd_items.append(block)
            else:
                text += elem
        entry = self._dl_entry(dt_item, dd_items, text)
        if entry is not None:
            dl_items.append(entry)
        return "\n\n".join(dl_items)

    # Return markdown for an <img> element.  Only the src attribute is
    # used ("#" when it is missing); the alt text is not emitted, and no
    # substitution reference is created yet (create_sub is a stub).
    def img_to_md(self):
        src = self.attrs["src"] if "src" in self.attrs else "#"
        return "[!" + src + "]"

    # Convert a table to markdown.  Tables with a single row or a single
    # column are written as plain paragraphs; all others are written as
    # one "|"-delimited line per row, with cells padded to the column
    # width.  <caption> content is not emitted.
    def table_to_md(self):
        rows = []
        column_widths = []

        # Collect the converted cells of every <tr>.
        for row in self.elements:
            if not isinstance(row, Tag):
                continue
            if row.tagname == "caption":
                continue
            if row.tagname != "tr":
                sys.stderr.write("Found <%s> tag in <table> at %d\n" % (row.tagname, row.loc[0]))
                continue
            cells = [
                e.to_md()
                for e in row.elements
                if isinstance(e, Tag) and e.tagname in ["th", "td"]
            ]
            if not cells:
                continue
            rows.append(cells)
            # Record the max column width for each column.
            column_widths.extend([0] * (len(cells) - len(column_widths)))
            for i, cell in enumerate(cells):
                column_widths[i] = max(column_widths[i], longest_line(cell))

        # Generate single-row and single-column tables as regular paragraphs.
        if len(column_widths) == 1 or len(rows) == 1:
            return "\n\n".join(cell for cells in rows for cell in cells)

        # Generate the table, row by row.
        lines = []
        for cells in rows:
            padded_cells = [cell.ljust(width) for cell, width in zip(cells, column_widths)]
            lines.append("|" + "|".join(padded_cells) + "|\n")
        return "".join(lines)

    # Convert child elements to md, and combine the results.  Runs of
    # text between child blocks are split into paragraphs.
    def block_tag_to_md(self):
        blocks = []
        text = ""
        for elem in self.elements:
            if isinstance(elem, Tag):
                blocks += form_paras(text)
                text = ""
                block = elem.to_md()
                if block:
                    blocks.append(block)
            else:
                text += elem
        blocks += form_paras(text)
        return "\n\n".join(blocks)
| 384 | |||
# Indent all lines of a paragraph by 4 spaces.
#
# Blank lines are left empty rather than padded, so no trailing
# whitespace is produced.  Four spaces is the indentation Markdown
# requires for an indented code block, which is how <pre> content is
# emitted.
def indent(text):
    return "\n".join("    " + line if line else "" for line in text.split("\n"))
| 394 | |||
# Place marker in front of the first line of text, and indent
# all subsequent lines by the width of the marker.  Every line is
# stripped of surrounding whitespace first; an empty first line is
# left empty and gets no marker.
def hanging_indent(text, marker):
    head, *tail = text.split("\n")
    padding = " " * len(marker)
    result = [marker + head.strip() if head else ""]
    result.extend(padding + line.strip() for line in tail)
    return "\n".join(result)
| 408 | |||
# Form paragraphs out of text.  Lines are stripped of surrounding
# whitespace, and each run of consecutive non-blank lines becomes one
# paragraph.  Returns the list of paragraphs (empty if the text holds
# no non-blank lines).

def form_paras(text):
    paragraphs = []
    pending = []
    stripped_lines = [line.strip() for line in text.split("\n")]
    # The trailing empty string acts as a sentinel that flushes the
    # last paragraph.
    for line in stripped_lines + [""]:
        if line:
            pending.append(line)
            continue
        if pending:
            paragraphs.append("\n".join(pending))
            pending = []
    return paragraphs
| 424 | |||
# Return the length of the longest line in a paragraph
# (0 for an empty string).

def longest_line(text):
    return max(map(len, text.split("\n")))
| 430 | |||
# Class HtmlToMD: Parse the HTML source, build an intermediate tree,
# and convert to Markdown.
#   current_tag is the Tag currently being filled in; after close() it
#       is the root of the tree.
#   title is the inline text of the <title> element, if one was seen.

class HtmlToMD(HTMLParser):
    def __init__(self):
        self.current_tag = Tag("__root__", None, None, (0,0))
        self.title = ""
        HTMLParser.__init__(self)

    # Finish parsing and close every element that is still open.
    # Elements whose closing tag is not optional are reported on stderr.
    def close(self):
        HTMLParser.close(self)
        unclosed = []
        while self.current_tag.parent is not None:
            if self.current_tag.tagname not in auto_closing_tags:
                unclosed.append("%s (%d)" % (self.current_tag.tagname, self.current_tag.loc[0]))
            self.current_tag = self.current_tag.close()
        if unclosed:
            sys.stderr.write("End of file with unclosed tags: %s\n" % ", ".join(unclosed))

    # Open a new element, first auto-closing any open elements that
    # cannot contain it and supplying a missing <tr> or <td>.
    def handle_starttag(self, tagname, attrs):
        if tagname in ignored_tags:
            return

        while (self.current_tag.tagname in auto_closing_tags
               and not can_contain(self.current_tag.tagname, tagname)):
            self.current_tag = self.current_tag.close()

        if not can_contain(self.current_tag.tagname, tagname):
            if self.current_tag.tagname == "table":
                sys.stderr.write("Inserting missing <tr> before <%s> at line %d\n" % (tagname, self.getpos()[0]))
                self.handle_starttag("tr", [])
            elif self.current_tag.tagname == "tr":
                sys.stderr.write("Inserting missing <td> before <%s> at line %d\n" % (tagname, self.getpos()[0]))
                self.handle_starttag("td", [])

        attrs = dict(attrs)
        tag = Tag(tagname, attrs, self.current_tag, self.getpos())
        self.current_tag = tag
        # Elements that never have a closing tag are closed immediately.
        if tagname in unbalanced_tags:
            self.current_tag = self.current_tag.close()

    # Close the element named by an end tag.  Open elements nested
    # inside it are closed first (with a message); an end tag that
    # matches no open element is reported and otherwise ignored.
    def handle_endtag(self, tagname):
        if tagname in ignored_tags or tagname in unbalanced_tags:
            return

        if tagname == "title":
            self.title = self.current_tag.to_md_inline()

        if tagname != self.current_tag.tagname and self.current_tag.is_inside(tagname):
            while tagname != self.current_tag.tagname:
                if (self.current_tag.tagname in auto_closing_tags
                        and can_contain(tagname, self.current_tag.tagname)):
                    sys.stderr.write("Auto-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
                else:
                    sys.stderr.write("Force-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
                self.current_tag = self.current_tag.close()

        if tagname == self.current_tag.tagname:
            self.current_tag = self.current_tag.close()
        else:
            sys.stderr.write("Found </%s> at line %d, expected </%s>.\n" % (tagname, self.getpos()[0], self.current_tag.tagname))

    # Add text to the current element after some light rewriting.
    def handle_data(self, data):
        # Convert old troff-style quoting conventions to Unicode quotes.
        data = re.sub(r"``", u"\u201c", data)
        data = re.sub(r"''", u"\u201d", data)
        # Markup code terms like DW_AT_type and .debug_info.
        data = re.sub(r'(?<!\w)([.a-zA-Z]+_[a-zA-Z0-9_]+)\b', r'`\1`', data)
        self.current_tag.append_data(data)

    # Add the character for a named entity reference such as &amp;.
    # An unknown name is kept as literal "&name;" text instead of
    # raising KeyError.
    # NOTE: HTMLParser is constructed with its default settings here; per
    # the html.parser documentation, the default convert_charrefs=True
    # converts references before handle_data() is called, so this
    # handler and handle_charref() are only reached if that is disabled.
    def handle_entityref(self, name):
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            text = "&" + name + ";"
        else:
            # chr() is the Python 3 spelling of Python 2's unichr().
            text = chr(codepoint)
        self.current_tag.append_data(text)

    # Add the character for a numeric reference such as &#65; or &#x41;.
    # A malformed or out-of-range number is kept as literal text.
    def handle_charref(self, name):
        try:
            if name[:1] in ("x", "X"):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            c = "&#" + name + ";"
        self.current_tag.append_data(c)

    # Write a debug representation of the tree to stderr.
    def debug(self):
        self.current_tag.debug("")

    # Return the markdown for the tree rooted at the current element.
    def to_md(self):
        return self.current_tag.to_md()
| 516 | |||
# Return a list of all descendant tags with the given tagname, in
# document order.  The search does not continue below a matching tag,
# so matches nested inside another match are not returned.

def find_tags(block, tagname):
    matches = []
    for child in block.elements:
        if not isinstance(child, Tag):
            continue
        if child.tagname == tagname:
            matches.append(child)
        else:
            matches.extend(find_tags(child, tagname))
    return matches
| 528 | |||
# The DWARF pages all start with two table elements as the page header,
# and end with a table element as the page footer. Recognize these:
# return True if elem is a table with a <td> cell whose markdown
# contains one of the known header/footer marker strings.

def is_header_footer(elem):
    if not (isinstance(elem, Tag) and elem.tagname == "table"):
        return False
    markers = ("[!dwarf.png]", "[HOME]", "All logos and trademarks")
    for cell in find_tags(elem, "td"):
        rendered = cell.to_md()
        if any(marker in rendered for marker in markers):
            return True
    return False
| 546 | |||
# Command-line driver: read HTML from stdin, write markdown to stdout.
#   -h    strip the DWARF page header and footer tables
#   -p    convert a DWARF issue page (metadata block + <pre> contents)
#   -v n  target DWARF version number written to the issue metadata
#   -d    dump the parsed tree to stderr before converting

strip_header_footer = False
convert_proposal = False
dwarf_version = "6"
debug = False

try:
    opts, args = getopt.getopt(sys.argv[1:], "hdpv:")
except getopt.GetoptError as err:
    # err is an exception object, not a string; convert it before
    # concatenating so the usage error is actually printed.
    sys.stderr.write(str(err) + "\n")
    sys.exit(2)
for o, a in opts:
    if o == "-p":
        convert_proposal = True
    elif o == "-h":
        strip_header_footer = True
    elif o == "-v":
        dwarf_version = a
    elif o == "-d":
        debug = True

parser = HtmlToMD()

# Feed the input line by line, with tabs expanded and CRLF line endings
# normalized.
for l in sys.stdin:
    parser.feed(l.expandtabs(4).replace("\r\n", "\n"))
parser.close()

if debug:
    parser.debug()

root = parser.current_tag
if strip_header_footer:
    root.elements = list(filter(lambda e: not is_header_footer(e), root.elements))

if convert_proposal:
    # The issue metadata is taken from the first six cells of the first
    # row of the first table on the page.
    table_elems = find_tags(root, "table")
    if len(table_elems) > 0:
        meta_info_tbl = table_elems[0]
        if len(meta_info_tbl.elements) > 0:
            first_row = meta_info_tbl.elements[0]
            meta_info = first_row.elements
            if len(meta_info) >= 6:
                prop_num = meta_info[0].to_md()
                prop_author = meta_info[1].to_md()
                prop_title = meta_info[2].to_md()
                prop_type = meta_info[3].to_md()
                prop_status = meta_info[4].to_md()
                prop_champion = meta_info[5].to_md()
                # The issue number starts with a YYMMDD date; rewrite
                # it as 20YY-MM-DD.
                prop_submit_date = re.sub(r"(\d\d)(\d\d)(\d\d)\..*", r"20\1-\2-\3", prop_num)
                print("Title: %s" % prop_title)
                print("Author: %s" % prop_author)
                print("Champion: %s" % prop_champion)
                print("Submit-Date: %s" % prop_submit_date)
                print("Propid: %s" % prop_num)
                print("Type: %s" % prop_type)
                print("Status: %s" % prop_status)
                print("Version: %s" % dwarf_version)
                print("")

    # The contents of the first <pre> element are used as the markdown.
    pre_elems = find_tags(parser.current_tag, "pre")
    if pre_elems:
        print(pre_elems[0].pre_to_md())

else:
    if parser.title:
        print("Title: %s" % parser.title)
        print("")
    print(root.to_md())
