summaryrefslogtreecommitdiffstats
path: root/tools
diff options
authorCary Coutant <ccoutant@gmail.com>2023-03-23 17:14:58 -0700
committerCary Coutant <ccoutant@gmail.com>2023-03-23 17:14:58 -0700
commit49409d19ef4c6de858905775b3662e14ee735bcd (patch)
treea1830713349b8edd0693bd90efb7af269cf9282e /tools
parentissues/170427.3 Fix more typos found by Pedro Alves (diff)
Initial conversion to markdown format.
I've included a python script in tools/html-to-md.py that I used to do this conversion, in case anyone wants to reconstruct it. It does a rudimentary job, slightly tailored to the structure of the existing DWARF HTML pages. After the conversion, I did some manual tweaks, and rearranged a few pages.
Diffstat (limited to 'tools')
-rwxr-xr-xtools/html-to-md.py613
1 files changed, 613 insertions, 0 deletions
diff --git a/tools/html-to-md.py b/tools/html-to-md.py
new file mode 100755
index 0000000..ec6202b
--- /dev/null
+++ b/tools/html-to-md.py
@@ -0,0 +1,613 @@
1#! /usr/bin/python3
2
3# html-to-md
4# Rudimentary conversion of HTML to Markdown.
5# -h
6# Strips the header and footer elements from the page.
7# Looks for the specific patterns used in the DWARF HTML code.
8# -p
9# Tailors the conversion to DWARF issue pages, extracting
10# metadata from the start of the file, and using the
11# contents of the <pre> element as the markdown.
12# The metadata is placed at the top of the generated
13# markdown file.
14# -v n
15# Specifies the target DWARF version number (for issue metadata).
16# Default: 6.
17
18import sys
19import re
20import getopt
21from html.parser import HTMLParser
22from html.entities import name2codepoint
23
# Tag classification tables used by the parser and the converter.
# Each table is a plain list of lower-case HTML tag names.

# Tags laid out as blocks on the page ("__root__" is the synthetic
# top-level node created by the parser).
block_tags = (
    "__root__ "
    "html head meta title body "
    "h1 h2 h3 h4 h5 h6 "
    "div p "
    "table thead tbody tr th td caption "
    "pre ul ol li dl dt dd "
    "hr"
).split()

# Tags with a fixed internal structure: only specific child tags are
# allowed, and free text found directly inside them is ignored.
structured_tags = "table thead tbody tr".split()

# Tags that may contain other block tags.  Consulted when deciding
# whether to auto-close the current tag or nest the new one inside it.
can_contain_block_tags = (
    "html head body div "
    "table thead tbody tr th td "
    "pre ul ol li dl dd"
).split()

# Inline content tags; they may appear inside any other
# (non-structured) tag.
inline_tags = "i b em strong code br img a small button".split()

# Tags whose closing tag is optional.  They are closed automatically
# when a start tag arrives that they cannot contain.
auto_closing_tags = "meta body p li dt dd tr td th a".split()

# Tags that never take a closing tag.
unbalanced_tags = "br hr img".split()

# Tags skipped entirely during conversion.
ignored_tags = "html head thead tbody hr font".split()

# Heading tags, in order of precedence (index 0 is the top level).
heading_tags = "h1 h2 h3 h4 h5 h6".split()
88
89# Return True if a tag1 element can contain a tag2 element.
90
def can_contain(tag1, tag2):
    """Return True if a tag1 element may directly contain a tag2 element.

    A handful of explicit exclusions are checked first; anything not
    excluded is allowed when tag1 is a block container or tag2 is an
    inline tag.
    """
    definition_parts = ("dt", "dd")
    excluded = (
        # <a> and <li> never nest directly inside themselves.
        (tag1 == tag2 and tag1 in ("a", "li"))
        # <dt>/<dd> never nest inside one another.
        or (tag1 in definition_parts and tag2 in definition_parts)
        # Structured table parts accept only their designated children.
        or (tag1 == "table" and tag2 not in ("tr", "caption"))
        or (tag1 == "tr" and tag2 not in ("th", "td"))
        # <body> is never nested inside anything.
        or tag2 == "body"
    )
    if excluded:
        return False
    return tag1 in can_contain_block_tags or tag2 in inline_tags
105
106# Create a substitution reference for ref and return the inline text.
107
def create_sub(ref):
    """Return the inline text to use for a substitution reference to ref.

    Currently a placeholder: ref is handed back unchanged.
    """
    # TODO: create a unique substitution reference for ref.
    inline_text = ref
    return inline_text
111
112# Class Tag represents an HTML element.
113# tagname is the HTML tag.
114# attrs is a dictionary of the element's attributes.
115# parent is a pointer to the containing element.
116# loc is the line number within the HTML source.
117# aux is a list of auxiliary blocks (e.g., substitution definitions) to write
118# at the end of the document.
119
class Tag:
    """One HTML element in the intermediate tree.

    Attributes:
        tagname:  the HTML tag name.
        attrs:    dictionary of the element's attributes.
        parent:   the containing Tag (None for the root).
        elements: child nodes; each is either a Tag (block child) or a
                  string (text, including already-converted inline tags).
        loc:      position in the HTML source; loc[0] is the line number.
        aux:      auxiliary blocks written at the end of the document.
    """

    def __init__(self, tagname, attrs, parent, loc):
        self.tagname = tagname
        self.attrs = attrs
        self.parent = parent
        self.loc = loc
        self.elements = []
        self.aux = []

    def is_inside(self, tagname):
        """Return True if any ancestor of this element is a tagname element."""
        ancestor = self.parent
        while ancestor:
            if ancestor.tagname == tagname:
                return True
            ancestor = ancestor.parent
        return False

    def append_element(self, elem):
        """Append a child element; a <body> child is flattened into this node."""
        if elem.tagname != "body":
            self.elements.append(elem)
            return
        self.elements.extend(elem.elements)

    def append_aux(self, elem):
        """Append a new auxiliary block."""
        self.aux.append(elem)

    def append_data(self, data):
        """Append a text node.

        Structured tags take no free text: non-blank text is reported on
        stderr and discarded, blank text is discarded silently.
        """
        if self.tagname not in structured_tags:
            if data:
                self.elements.append(data)
            return
        if data.strip():
            sys.stderr.write("Error: found unstructured data inside %s element at line %d\n" % (self.tagname, self.loc[0]))

    def close(self):
        """Close this element and return its parent.

        A block tag is appended to the parent as a child element; any
        other tag is converted to inline markdown and appended to the
        parent as text.  Auxiliary blocks are handed up to the parent.
        Closing the root is reported on stderr and returns the root.
        """
        parent = self.parent
        if not parent:
            sys.stderr.write("Error: attempted to close root tag\n")
            return self
        if self.tagname in block_tags:
            parent.append_element(self)
        else:
            parent.append_data(self.to_md_inline())
        for block in self.aux:
            parent.append_aux(block)
        return parent

    def debug(self, level):
        """Write a debug representation of this subtree to stderr."""
        sys.stderr.write(level + self.tagname + "\n")
        child_level = level + "| "
        for child in self.elements:
            if isinstance(child, Tag):
                child.debug(child_level)
            else:
                sys.stderr.write(child_level + repr(child) + "\n")

    def to_md_inline(self):
        """Convert this tag to inline markdown text."""
        name = self.tagname

        if name == "br":
            # Ignore <br> tags for now.
            return ""

        if name == "img":
            return self.img_to_md()

        if name == "code" and self.is_inside("pre"):
            # Inside <pre> the text is emitted verbatim, with no markers.
            return self.collect_inline_elements()

        if name == "a":
            if "href" in self.attrs:
                href = self.attrs["href"]
            else:
                href = "#"
            label = self.collect_inline_elements().strip()
            return "[" + label + "](%s)" % href

        # Emphasis-style tags wrap their stripped content in a marker and
        # vanish entirely when the content is empty.
        markers = {"i": "*", "em": "*", "b": "**", "strong": "**", "code": "``"}
        content = self.collect_inline_elements()
        if name not in markers:
            return content
        content = content.strip()
        if not content:
            return ""
        marker = markers[name]
        return marker + content + marker

    def collect_inline_elements(self):
        """Convert all children to inline markdown and join them into one string."""
        return "".join(
            child.to_md_inline() if isinstance(child, Tag) else child
            for child in self.elements
        )

    def to_md(self):
        """Convert this block element to markdown.

        On the root (no parent), the collected auxiliary blocks are
        appended after the converted text.
        """
        name = self.tagname
        if name == "title":
            md = ""
        elif name == "table":
            md = self.table_to_md()
        elif name == "pre":
            md = indent(self.pre_to_md())
        elif name in heading_tags:
            md = self.heading_to_md(heading_tags.index(name))
        elif name in ["ul", "ol"]:
            md = self.list_to_md()
        elif name == "dl":
            md = self.dl_to_md()
        else:
            md = self.block_tag_to_md()

        if self.aux and not self.parent:
            md += "\n\n" + "\n\n".join(self.aux)
        return md

    def heading_to_md(self, level):
        """Return markdown for a heading; level 0 yields a single '#'."""
        pieces = []
        for child in self.elements:
            pieces.append(child.to_md() if isinstance(child, Tag) else child)
        # A heading must stay on one line.
        title = "".join(pieces).replace("\n", " ")
        return "#" * (level + 1) + " " + title

    def pre_to_md(self):
        """Return the text of a <pre> element with outer newlines removed."""
        pieces = []
        for child in self.elements:
            pieces.append(child.pre_to_md() if isinstance(child, Tag) else child)
        return "".join(pieces).strip("\n")

    def list_to_md(self):
        """Return markdown for a <ul> or <ol> element.

        Only <li> children contribute; every ordered item uses "1. ".
        """
        marker = "1. " if self.tagname != "ul" else "* "
        items = [
            hanging_indent(child.to_md().strip(), marker)
            for child in self.elements
            if isinstance(child, Tag) and child.tagname == "li"
        ]
        return "\n\n".join(items)

    def dl_to_md(self):
        """Return markdown for a <dl> element.

        Each entry is the term on its own line followed by its indented
        definition blocks; entries are separated by blank lines.
        """
        def render_entry(term, definitions, pending):
            # Build the text for one term and its definitions, or return
            # None when there is nothing at all to emit.
            pending = pending.strip()
            if not (term or definitions or pending):
                return None
            body = [indent(block) for block in definitions + form_paras(pending)]
            return term + "\n" + "\n\n".join(body)

        entries = []
        term = ""
        definitions = []
        pending = ""
        for child in self.elements:
            if not isinstance(child, Tag):
                pending += child
                continue
            if child.tagname == "dt":
                # A new term starts: emit whatever was collected so far.
                entry = render_entry(term, definitions, pending)
                if entry is not None:
                    entries.append(entry)
                definitions = []
                pending = ""
                term = child.to_md()
            else:
                definitions += form_paras(pending)
                pending = ""
                block = child.to_md()
                if block:
                    definitions.append(block)

        entry = render_entry(term, definitions, pending)
        if entry is not None:
            entries.append(entry)
        return "\n\n".join(entries)

    def img_to_md(self):
        """Return the inline text for an <img> element: "[!" + src + "]"."""
        attrs = self.attrs
        if "src" in attrs:
            src = attrs["src"]
        else:
            src = "#"
        # Neither the substitution reference nor the alt text appears in
        # the returned text.
        sub = create_sub(src)
        if "alt" in attrs:
            alt = attrs["alt"]
        else:
            alt = ""
        return "[!" + src + "]"

    def table_to_md(self):
        """Convert a table to markdown.

        Tables with a single row or a single column are emitted as plain
        paragraphs; otherwise each row becomes one '|'-delimited line with
        cells padded to the column width.
        """
        caption = ""
        rows = []
        widths = []
        header_rows = 0  # counted but not used in the output

        # Find the <caption> (if any) and the <tr> elements.
        for child in self.elements:
            if not isinstance(child, Tag):
                continue
            if child.tagname == "caption":
                caption = child.to_md()
                continue
            if child.tagname != "tr":
                sys.stderr.write("Found <%s> tag in <table> at %d\n" % (child.tagname, child.loc[0]))
                continue

            # Extract the <th> and <td> elements.
            cell_tags = [
                e for e in child.elements
                if isinstance(e, Tag) and e.tagname in ["th", "td"]
            ]
            if all(e.tagname == "th" for e in cell_tags):
                header_rows += 1
            if not cell_tags:
                continue

            cells = [cell.to_md() for cell in cell_tags]
            rows.append(cells)
            # Record the max column width for each column.
            widths.extend([0] * (len(cells) - len(widths)))
            for i, cell in enumerate(cells):
                widths[i] = max(widths[i], longest_line(cell))

        # Generate single-row and single-column tables as regular paragraphs.
        if len(widths) == 1 or len(rows) == 1:
            return "\n\n".join(cell for row in rows for cell in row)

        # Generate the table, row by row.
        lines = []
        for row in rows:
            padded = [cell.ljust(width) for cell, width in zip(row, widths)]
            lines.append("|" + "|".join(padded) + "|\n")
        return "".join(lines)

    def block_tag_to_md(self):
        """Convert child elements to markdown and join them with blank lines.

        Runs of text between child tags are split into paragraphs.
        """
        blocks = []
        pending = ""
        for child in self.elements:
            if not isinstance(child, Tag):
                pending += child
                continue
            blocks.extend(form_paras(pending))
            pending = ""
            md = child.to_md()
            if md:
                blocks.append(md)
        blocks.extend(form_paras(pending))
        return "\n\n".join(blocks)
384
385# Indent all lines of a paragraph by 4 spaces.
def indent(text):
    """Indent every non-empty line of text by 4 spaces.

    Four spaces is what Markdown requires for an indented code block.
    Empty lines are left empty so that no trailing whitespace is
    produced.

    text:    a paragraph, possibly spanning several lines.
    Returns: the indented paragraph.
    """
    prefix = " " * 4
    return "\n".join(prefix + line if line else "" for line in text.split("\n"))
394
395# Place marker in front of the first line of text, and indent
396# all subsequent lines.
def hanging_indent(text, marker):
    """Format text as a list item with a hanging indent.

    The marker is placed in front of the first non-blank line; every
    later non-blank line is indented by the width of the marker.

    Leading whitespace on later lines is kept, so nested lists and
    indented code blocks inside the item retain their relative
    indentation.  Blank lines stay empty rather than becoming
    whitespace-only lines.

    text:    the item's markdown, possibly spanning several lines.
    marker:  the list marker including its trailing space ("* ", "1. ").
    Returns: the formatted item.
    """
    prefix = " " * len(marker)
    lines = []
    marker_placed = False
    for line in text.split("\n"):
        line = line.rstrip()
        if not line:
            lines.append("")
        elif not marker_placed:
            lines.append(marker + line.lstrip())
            marker_placed = True
        else:
            lines.append(prefix + line)
    return "\n".join(lines)
408
409# Form paragraphs out of text, with single blank lines between each paragraph.
410
def form_paras(text):
    """Split text into paragraphs and return them as a list of strings.

    Lines are stripped of surrounding whitespace.  A run of consecutive
    non-blank lines forms one paragraph (its lines joined with "\\n");
    blank lines separate paragraphs and are otherwise dropped.
    """
    paragraphs = []
    current = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if line:
            current.append(line)
            continue
        # Blank line: finish the paragraph in progress, if any.
        if current:
            paragraphs.append("\n".join(current))
            current = []
    if current:
        paragraphs.append("\n".join(current))
    return paragraphs
424
425# Find longest line in a paragraph.
426
def longest_line(text):
    """Return the length of the longest line in text (0 for empty text)."""
    widest = max(text.split("\n"), key=len)
    return len(widest)
430
431# Class HtmlToMD: Parse the HTML source, build an intermediate tree,
432# and convert to Markdown.
433
class HtmlToMD(HTMLParser):
    """Parse HTML source into a tree of Tag nodes and convert it to Markdown.

    current_tag is the element currently being filled; it starts as the
    synthetic "__root__" node and is the root again once close() has run.
    title holds the inline text of the <title> element, if one was seen.
    """

    # Matches code terms like DW_AT_type and .debug_info.
    _CODE_TERM_RE = re.compile(r'(?<!\w)([.a-zA-Z]+_[a-zA-Z0-9_]+)\b')

    def __init__(self):
        self.current_tag = Tag("__root__", None, None, (0, 0))
        self.title = ""
        HTMLParser.__init__(self)

    def close(self):
        """Finish parsing and close every element that is still open.

        Open elements whose closing tag is not optional are reported on
        stderr together with the line where they started.
        """
        HTMLParser.close(self)
        unclosed = []
        while self.current_tag.parent is not None:
            if self.current_tag.tagname not in auto_closing_tags:
                unclosed.append("%s (%d)" % (self.current_tag.tagname, self.current_tag.loc[0]))
            self.current_tag = self.current_tag.close()
        if unclosed:
            sys.stderr.write("End of file with unclosed tags: %s\n" % ", ".join(unclosed))

    def handle_starttag(self, tagname, attrs):
        """Open a new element, repairing the tree around it where needed."""
        if tagname in ignored_tags:
            return

        # Close elements with optional end tags that cannot hold the new tag.
        while (self.current_tag.tagname in auto_closing_tags
               and not can_contain(self.current_tag.tagname, tagname)):
            self.current_tag = self.current_tag.close()

        # Supply missing table structure.  This repeats so that content
        # placed directly inside <table> receives both a <tr> and a <td>;
        # table_to_md() only looks at <th>/<td> children of a row, so
        # anything put straight into a <tr> would be lost.
        while not can_contain(self.current_tag.tagname, tagname):
            container = self.current_tag.tagname
            if container == "table":
                missing = "tr"
            elif container == "tr":
                missing = "td"
            else:
                break
            sys.stderr.write("Inserting missing <%s> before <%s> at line %d\n" % (missing, tagname, self.getpos()[0]))
            self.handle_starttag(missing, [])

        attrs = dict(attrs)
        tag = Tag(tagname, attrs, self.current_tag, self.getpos())
        self.current_tag = tag
        # Tags that never take an end tag are closed immediately.
        if tagname in unbalanced_tags:
            self.current_tag = self.current_tag.close()

    def handle_endtag(self, tagname):
        """Close the element matching tagname, closing open children first."""
        if tagname in ignored_tags or tagname in unbalanced_tags:
            return

        if tagname == "title":
            self.title = self.current_tag.to_md_inline()

        # The end tag belongs to an ancestor: close everything in between,
        # reporting whether each closure was an expected optional one.
        if tagname != self.current_tag.tagname and self.current_tag.is_inside(tagname):
            while tagname != self.current_tag.tagname:
                if (self.current_tag.tagname in auto_closing_tags
                        and can_contain(tagname, self.current_tag.tagname)):
                    sys.stderr.write("Auto-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
                else:
                    sys.stderr.write("Force-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
                self.current_tag = self.current_tag.close()

        if tagname == self.current_tag.tagname:
            self.current_tag = self.current_tag.close()
        else:
            sys.stderr.write("Found </%s> at line %d, expected </%s>.\n" % (tagname, self.getpos()[0], self.current_tag.tagname))

    def handle_data(self, data):
        """Append text to the current element after normalizing it."""
        # Convert old troff-style quoting conventions to Unicode quotes.
        data = re.sub(r"``", "\u201c", data)
        data = re.sub(r"''", "\u201d", data)
        # Markup code terms like DW_AT_type and .debug_info.
        data = self._CODE_TERM_RE.sub(r'`\1`', data)
        self.current_tag.append_data(data)

    def handle_entityref(self, name):
        """Append the character for a named entity reference (e.g. "amp").

        An unknown entity name is kept literally as "&name;" instead of
        raising KeyError.  HTMLParser does not call this method while
        its convert_charrefs option is true (the default).
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            text = "&%s;" % name
        else:
            text = chr(codepoint)
        self.current_tag.append_data(text)

    def handle_charref(self, name):
        """Append the character for a numeric reference ("65" or "x41").

        Raises ValueError if name is not a valid decimal or hexadecimal
        number.  HTMLParser does not call this method while its
        convert_charrefs option is true (the default).
        """
        if name[:1] in ("x", "X"):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        self.current_tag.append_data(c)

    def debug(self):
        """Write a debug dump of the current subtree to stderr."""
        self.current_tag.debug("")

    def to_md(self):
        """Return the markdown for the current element's subtree."""
        return self.current_tag.to_md()
516
517# Return a list of all descendent tags with the given tagname.
518
def find_tags(block, tagname):
    """Return a list of the descendant tags of block named tagname.

    The search is depth-first in document order and does not descend
    into a matching tag.
    """
    found = []
    for child in block.elements:
        if not isinstance(child, Tag):
            continue
        if child.tagname == tagname:
            found.append(child)
        else:
            found.extend(find_tags(child, tagname))
    return found
528
529# The DWARF pages all start with two table elements as the page header,
530# and end with a table element as the page footer. Recognize these.
531
def is_header_footer(elem):
    """Return True if elem is one of the page header or footer tables.

    A table qualifies when the markdown of any of its <td> cells contains
    one of the marker strings below.
    """
    if not isinstance(elem, Tag) or elem.tagname != "table":
        return False
    markers = ("[!dwarf.png]", "[HOME]", "All logos and trademarks")
    for cell in find_tags(elem, "td"):
        text = cell.to_md()
        if any(marker in text for marker in markers):
            return True
    return False
546
# Command-line settings and their defaults.
strip_header_footer = False   # -h: drop the page header/footer tables
convert_proposal = False      # -p: treat the input as a DWARF issue page
dwarf_version = "6"           # -v n: target DWARF version for the metadata
debug = False                 # -d: dump the parsed tree to stderr

try:
    opts, args = getopt.getopt(sys.argv[1:], "hdpv:")
except getopt.GetoptError as err:
    # The exception must be converted explicitly; adding it to a str
    # raises TypeError and hides the actual usage error.
    sys.stderr.write(str(err) + "\n")
    sys.exit(2)
for o, a in opts:
    if o == "-p":
        convert_proposal = True
    elif o == "-h":
        strip_header_footer = True
    elif o == "-v":
        dwarf_version = a
    elif o == "-d":
        debug = True
566
# Parse the HTML from stdin, line by line, with tabs expanded to
# 4 columns and CRLF line endings converted to LF.
parser = HtmlToMD()
for line in sys.stdin:
    parser.feed(line.expandtabs(4).replace("\r\n", "\n"))
parser.close()

if debug:
    parser.debug()

root = parser.current_tag
if strip_header_footer:
    root.elements = [e for e in root.elements if not is_header_footer(e)]

if not convert_proposal:
    # Generic page: optional title line, then the whole document.
    if parser.title:
        print("Title: %s" % parser.title)
        print("")
    print(root.to_md())
else:
    # Issue page: the first row of the first table supplies the metadata.
    table_elems = find_tags(root, "table")
    if len(table_elems) > 0 and len(table_elems[0].elements) > 0:
        meta_info = table_elems[0].elements[0].elements
        if len(meta_info) >= 6:
            (prop_num, prop_author, prop_title,
             prop_type, prop_status, prop_champion) = [
                cell.to_md() for cell in meta_info[:6]
            ]
            # The issue number starts with the submit date as YYMMDD.
            prop_submit_date = re.sub(r"(\d\d)(\d\d)(\d\d)\..*", r"20\1-\2-\3", prop_num)
            metadata = (
                ("Title", prop_title),
                ("Author", prop_author),
                ("Champion", prop_champion),
                ("Submit-Date", prop_submit_date),
                ("Propid", prop_num),
                ("Type", prop_type),
                ("Status", prop_status),
                ("Version", dwarf_version),
            )
            for label, value in metadata:
                print("%s: %s" % (label, value))
            print("")

    # The body of the issue is the content of the first <pre> element.
    pre_elems = find_tags(parser.current_tag, "pre")
    if pre_elems:
        print(pre_elems[0].pre_to_md())