Initial conversion to markdown format.

I've included a Python script in tools/html-to-md.py that I used to do this conversion, in case anyone wants to reconstruct it. It does a rudimentary job, slightly tailored to the structure of the existing DWARF HTML pages. After the conversion, I did some manual tweaks, and rearranged a few pages.
author: Cary Coutant <ccoutant@gmail.com> 2023-03-23 17:14:58 -0700
committer: Cary Coutant <ccoutant@gmail.com> 2023-03-23 17:14:58 -0700
commit: 49409d19ef4c6de858905775b3662e14ee735bcd (patch)
tree: a1830713349b8edd0693bd90efb7af269cf9282e /tools
parent: issues/170427.3 Fix more typos found by Pedro Alves (diff)
1 files changed, 613 insertions, 0 deletions
diff --git a/tools/html-to-md.py b/tools/html-to-md.py
new file mode 100755
index 0000000..ec6202b
--- /dev/null
+++ b/tools/html-to-md.py
@@ -0,0 +1,613 @@
+#! /usr/bin/python3
+# html-to-md
+# Rudimentary conversion of HTML to Markdown.
+# -h
+# Strips the header and footer elements from the page.
+# Looks for the specific patterns used in the DWARF HTML code.
+# -p
+# Tailors the conversion to DWARF issue pages, extracting
+# metadata from the start of the file, and using the
+# contents of the <pre> element as the markdown.
+# The metadata is placed at the top of the generated
+# markdown file.
+# -v n
+# Specifies the target DWARF version number (for issue metadata).
+# Default: 6.
+import sys
+import re
+import getopt
+from html.parser import HTMLParser
+from html.entities import name2codepoint
+# These tags are treated as block tags; i.e., they are laid out
+# as blocks on the page.
+block_tags = [
+ "__root__",
+ "html", "head", "meta", "title", "body",
+ "h1", "h2", "h3", "h4", "h5", "h6",
+ "div", "p",
+ "table", "thead", "tbody", "tr", "th", "td", "caption",
+ "pre", "ul", "ol", "li", "dl", "dt", "dd",
+ "hr"
+ ]
+# These tags have a well-defined structure, with specific
+# kinds of tags allowed within. Unstructured text inside
+# these tags will be ignored.
+structured_tags = [
+ "table", "thead", "tbody", "tr"
+ ]
+# These tags are allowed to contain other block tags.
+# Used to help decide whether to auto-close the current
+# tag or nest the new one inside.
+can_contain_block_tags = [
+ "html", "head", "body", "div",
+ "table", "thead", "tbody", "tr", "th", "td",
+ "pre", "ul", "ol", "li", "dl", "dd"
+ ]
+# These tags are treated as inline content, and may be
+# contained inside any other (non-structured) tag.
+inline_tags = [
+ "i", "b", "em", "strong", "code", "br", "img", "a", "small",
+ "button"
+ ]
+# These tags have optional closing tags, and should be
+# automatically closed when we see a starting tag that
+# cannot be contained inside it.
+auto_closing_tags = [
+ "meta", "body", "p", "li", "dt", "dd", "tr", "td", "th", "a"
+ ]
+# These tags do not require a closing tag.
+unbalanced_tags = [
+ "br", "hr", "img"
+ ]
+# We ignore the following tags for the purposes of conversion.
+ignored_tags = [
+ "html", "head", "thead", "tbody", "hr", "font"
+ ]
+# These are the heading tags, in order of precedence.
+heading_tags = [
+ "h1", "h2", "h3", "h4", "h5", "h6"
+ ]
# Return True if a tag1 element can contain a tag2 element.
def can_contain(tag1, tag2):
    # Anchors and list items never nest directly inside themselves.
    if tag1 == tag2 and tag1 in ("a", "li"):
        return False
    # A <dt>/<dd> ends as soon as the next <dt>/<dd> starts.
    definition_parts = ("dt", "dd")
    if tag1 in definition_parts and tag2 in definition_parts:
        return False
    # Table containers accept only their specific child tags.
    allowed_children = {"table": ("tr", "caption"), "tr": ("th", "td")}
    if tag1 in allowed_children and tag2 not in allowed_children[tag1]:
        return False
    # Nothing may contain <body>.
    if tag2 == "body":
        return False
    return tag1 in can_contain_block_tags or tag2 in inline_tags
# Create a substitution reference for ref and return the inline text.
# TODO: create a unique substitution reference for ref; until then the
# reference is handed back unchanged.
def create_sub(ref):
    return ref
# Class Tag represents an HTML element.
#   tagname is the HTML tag.
#   attrs is a dictionary of the element's attributes (None for the root).
#   parent is a pointer to the containing element (None for the root).
#   elements is the list of children: nested Tag objects and text strings.
#   loc is the position within the HTML source; loc[0] is reported as the
#       line number in diagnostics.
#   aux is a list of auxiliary blocks (e.g., substitution definitions) to write
#       at the end of the document.
class Tag:
    def __init__(self, tagname, attrs, parent, loc):
        self.tagname = tagname
        self.attrs = attrs
        self.parent = parent
        self.elements = []
        self.loc = loc
        self.aux = []

    # Return True if we are inside a tagname element (any ancestor matches).
    def is_inside(self, tagname):
        node = self.parent
        while node:
            if node.tagname == tagname:
                return True
            node = node.parent
        return False

    # Append a new child element. A <body> element is transparent: its
    # children are spliced directly into this element.
    def append_element(self, elem):
        if elem.tagname == "body":
            self.elements += elem.elements
        else:
            self.elements.append(elem)

    # Append a new auxiliary block.
    def append_aux(self, elem):
        self.aux.append(elem)

    # Append a new text node. Text inside a structured tag is dropped,
    # with a diagnostic if it is anything other than whitespace.
    def append_data(self, data):
        if self.tagname in structured_tags:
            if data.strip():
                sys.stderr.write("Error: found unstructured data inside %s element at line %d\n" % (self.tagname, self.loc[0]))
            return
        if data:
            self.elements.append(data)

    # Close the current element. If a block tag, append it as a block
    # to the parent; if an inline tag, convert it to inline text and
    # append the text to the parent. Auxiliary blocks are handed up to
    # the parent. Return the parent element (or self for the root).
    def close(self):
        if not self.parent:
            sys.stderr.write("Error: attempted to close root tag\n")
            return self
        if self.tagname in block_tags:
            self.parent.append_element(self)
        else:
            self.parent.append_data(self.to_md_inline())
        for item in self.aux:
            self.parent.append_aux(item)
        return self.parent

    # Write a debug representation of the current subtree to stderr.
    def debug(self, level):
        sys.stderr.write(level + self.tagname + "\n")
        for e in self.elements:
            if isinstance(e, Tag):
                e.debug(level + "| ")
            else:
                sys.stderr.write(level + "| " + repr(e) + "\n")

    # Convert the current tag to inline markdown.
    def to_md_inline(self):
        if self.tagname == "br":
            # Ignore <br> tags for now.
            return ""
        if self.tagname == "img":
            return self.img_to_md()
        if self.tagname in ("i", "em"):
            text = self.collect_inline_elements().strip()
            return "*" + text + "*" if text else ""
        if self.tagname in ("b", "strong"):
            text = self.collect_inline_elements().strip()
            return "**" + text + "**" if text else ""
        if self.tagname == "code":
            # Inside <pre> the text is already literal; do not add quotes.
            if self.is_inside("pre"):
                return self.collect_inline_elements()
            text = self.collect_inline_elements().strip()
            return "``" + text + "``" if text else ""
        if self.tagname == "a":
            href = self.attrs.get("href", "#")
            text = self.collect_inline_elements().strip()
            return "[" + text + "](%s)" % href
        return self.collect_inline_elements()

    # Convert all child elements to inline markdown and collect them into
    # one chunk of text.
    def collect_inline_elements(self):
        return "".join(
            elem.to_md_inline() if isinstance(elem, Tag) else elem
            for elem in self.elements
        )

    # Convert a block element to markdown. The auxiliary blocks are
    # appended only when converting the root (parentless) element.
    def to_md(self):
        if self.tagname == "title":
            text = ""
        elif self.tagname == "table":
            text = self.table_to_md()
        elif self.tagname == "pre":
            text = indent(self.pre_to_md())
        elif self.tagname in heading_tags:
            text = self.heading_to_md(heading_tags.index(self.tagname))
        elif self.tagname in ["ul", "ol"]:
            text = self.list_to_md()
        elif self.tagname == "dl":
            text = self.dl_to_md()
        else:
            text = self.block_tag_to_md()
        if not self.parent and self.aux:
            text += "\n\n" + "\n\n".join(self.aux)
        return text

    # Return markdown for a heading element; level is the zero-based
    # index into heading_tags. The heading is forced onto one line.
    def heading_to_md(self, level):
        blocks = [elem.to_md() if isinstance(elem, Tag) else elem for elem in self.elements]
        text = "".join(blocks)
        return "#" * (level + 1) + " " + text.replace("\n", " ")

    # Return markdown for a <pre> element: the concatenated raw content,
    # with leading and trailing newlines removed.
    def pre_to_md(self):
        blocks = [elem.pre_to_md() if isinstance(elem, Tag) else elem for elem in self.elements]
        return "".join(blocks).strip("\n")

    # Return markdown for a <ul> or <ol> element. Only <li> children
    # contribute; every ordered item uses the "1. " marker.
    def list_to_md(self):
        marker = "* " if self.tagname == "ul" else "1. "
        elems = []
        for elem in self.elements:
            if isinstance(elem, Tag) and elem.tagname == "li":
                elems.append(hanging_indent(elem.to_md().strip(), marker))
        return "\n\n".join(elems)

    # Format one definition-list entry: the term on the first line,
    # followed by its indented definition paragraphs. Return None when
    # there is nothing to emit.
    @staticmethod
    def _dl_entry(dt_item, dd_items, text):
        text = text.strip()
        if not (dt_item or dd_items or text):
            return None
        paras = [indent(p) for p in dd_items + form_paras(text)]
        return dt_item + "\n" + "\n\n".join(paras)

    # Return markdown for a <dl> element.
    def dl_to_md(self):
        dl_items = []
        dt_item = ""
        dd_items = []
        text = ""
        for elem in self.elements:
            if not isinstance(elem, Tag):
                # Loose text accumulates until the next element.
                text += elem
            elif elem.tagname == "dt":
                # A new term starts: flush whatever belongs to the previous one.
                entry = self._dl_entry(dt_item, dd_items, text)
                if entry is not None:
                    dl_items.append(entry)
                dt_item = elem.to_md()
                dd_items = []
                text = ""
            else:
                dd_items += form_paras(text)
                text = ""
                block = elem.to_md()
                if block:
                    dd_items.append(block)
        entry = self._dl_entry(dt_item, dd_items, text)
        if entry is not None:
            dl_items.append(entry)
        return "\n\n".join(dl_items)

    # Return markdown for an <img> element. The "[!src]" form is what
    # the rest of the script looks for (e.g. "[!dwarf.png]"); the alt
    # text is not used.
    def img_to_md(self):
        src = self.attrs.get("src", "#")
        return "[!" + src + "]"

    # Convert a table to markdown.
    def table_to_md(self):
        rows = []
        column_widths = []

        # Find the <tr> elements and convert their cells.
        for row in self.elements:
            if not isinstance(row, Tag):
                continue
            if row.tagname == "caption":
                # Captions are recognized but not emitted.
                continue
            if row.tagname != "tr":
                sys.stderr.write("Found <%s> tag in <table> at %d\n" % (row.tagname, row.loc[0]))
                continue
            # Extract the <th> and <td> elements.
            cells = [e.to_md() for e in row.elements
                     if isinstance(e, Tag) and e.tagname in ("th", "td")]
            if not cells:
                continue
            rows.append(cells)
            # Record the max column width for each column.
            if len(cells) > len(column_widths):
                column_widths.extend([0] * (len(cells) - len(column_widths)))
            for i, cell in enumerate(cells):
                column_widths[i] = max(column_widths[i], longest_line(cell))

        # Generate single-row and single-column tables as regular paragraphs.
        if len(column_widths) == 1 or len(rows) == 1:
            blocks = []
            for row in rows:
                blocks += row
            return "\n\n".join(blocks)

        # Generate the table, row by row, padding each cell to its column width.
        lines = []
        for row in rows:
            padded_cells = [cell.ljust(column_widths[i]) for i, cell in enumerate(row)]
            lines.append("|" + "|".join(padded_cells) + "|\n")
        return "".join(lines)

    # Convert child elements to md, and combine the results. Runs of
    # loose text between child elements are split into paragraphs.
    def block_tag_to_md(self):
        blocks = []
        text = ""
        for elem in self.elements:
            if isinstance(elem, Tag):
                blocks += form_paras(text)
                text = ""
                block = elem.to_md()
                if block:
                    blocks.append(block)
            else:
                text += elem
        blocks += form_paras(text)
        return "\n\n".join(blocks)
# Indent all lines of a paragraph by 4 spaces. Empty lines stay empty
# so that no trailing whitespace is produced.
def indent(text):
    return "\n".join("    " + line if line else "" for line in text.split("\n"))
# Place marker in front of the first line of text, and indent
# all subsequent lines by the width of the marker. Every line is
# stripped before its prefix is added; an empty first line gets no marker.
def hanging_indent(text, marker):
    first, *rest = text.split("\n")
    continuation = " " * len(marker)
    head = marker + first.strip() if first else ""
    return "\n".join([head] + [continuation + line.strip() for line in rest])
# Form paragraphs out of text, with single blank lines between each paragraph.
# Returns a list of paragraphs; each is a run of stripped, non-empty
# lines joined by newlines.
def form_paras(text):
    paras = []
    pending = []
    # The trailing "" acts as a sentinel that flushes the last paragraph.
    for line in [s.strip() for s in text.split("\n")] + [""]:
        if line:
            pending.append(line)
        elif pending:
            paras.append("\n".join(pending))
            pending = []
    return paras
# Find longest line in a paragraph and return its length.
def longest_line(text):
    return max(map(len, text.split("\n")))
# Class HtmlToMD: Parse the HTML source, build an intermediate tree,
# and convert to Markdown.
#   current_tag is the element currently being filled; after close()
#       it is the root of the tree.
#   title is the inline text of the <title> element, if one was seen.
class HtmlToMD(HTMLParser):
    def __init__(self):
        self.current_tag = Tag("__root__", None, None, (0,0))
        self.title = ""
        HTMLParser.__init__(self)

    # Finish parsing and close every element that is still open,
    # reporting those whose closing tag is not optional.
    def close(self):
        HTMLParser.close(self)
        unclosed = []
        while self.current_tag.parent is not None:
            if self.current_tag.tagname not in auto_closing_tags:
                unclosed.append("%s (%d)" % (self.current_tag.tagname, self.current_tag.loc[0]))
            self.current_tag = self.current_tag.close()
        if unclosed:
            sys.stderr.write("End of file with unclosed tags: %s\n" % ", ".join(unclosed))

    # Open a new element, first auto-closing any open elements that
    # cannot contain it and inserting a missing <tr>/<td> if needed.
    def handle_starttag(self, tagname, attrs):
        if tagname in ignored_tags:
            return

        while (self.current_tag.tagname in auto_closing_tags
               and not can_contain(self.current_tag.tagname, tagname)):
            self.current_tag = self.current_tag.close()

        if not can_contain(self.current_tag.tagname, tagname):
            if self.current_tag.tagname == "table":
                sys.stderr.write("Inserting missing <tr> before <%s> at line %d\n" % (tagname, self.getpos()[0]))
                self.handle_starttag("tr", [])
            elif self.current_tag.tagname == "tr":
                sys.stderr.write("Inserting missing <td> before <%s> at line %d\n" % (tagname, self.getpos()[0]))
                self.handle_starttag("td", [])

        attrs = dict(attrs)
        tag = Tag(tagname, attrs, self.current_tag, self.getpos())
        self.current_tag = tag
        # Tags such as <br> never get a closing tag; close them right away.
        if tagname in unbalanced_tags:
            self.current_tag = self.current_tag.close()

    # Close the named element, closing any still-open descendants first.
    def handle_endtag(self, tagname):
        if tagname in ignored_tags or tagname in unbalanced_tags:
            return

        if tagname == "title":
            self.title = self.current_tag.to_md_inline()

        if tagname != self.current_tag.tagname and self.current_tag.is_inside(tagname):
            while tagname != self.current_tag.tagname:
                if (self.current_tag.tagname in auto_closing_tags
                        and can_contain(tagname, self.current_tag.tagname)):
                    sys.stderr.write("Auto-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
                else:
                    sys.stderr.write("Force-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
                self.current_tag = self.current_tag.close()

        if tagname == self.current_tag.tagname:
            self.current_tag = self.current_tag.close()
        else:
            sys.stderr.write("Found </%s> at line %d, expected </%s>.\n" % (tagname, self.getpos()[0], self.current_tag.tagname))

    # Add a run of text to the current element.
    def handle_data(self, data):
        # Convert old troff-style quoting conventions to Unicode quotes.
        data = re.sub(r"``", u"\u201c", data)
        data = re.sub(r"''", u"\u201d", data)
        # Markup code terms like DW_AT_type and .debug_info.
        data = re.sub(r'(?<!\w)([.a-zA-Z]+_[a-zA-Z0-9_]+)\b', r'`\1`', data)
        self.current_tag.append_data(data)

    # Add the character for a named entity reference (e.g. "amp").
    # NOTE(review): HTMLParser only calls this and handle_charref when
    # convert_charrefs is False; with the default constructor the parser
    # decodes references itself -- confirm against the Python version in use.
    def handle_entityref(self, name):
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: keep the reference text rather than crash.
            self.current_tag.append_data("&%s;" % name)
        else:
            self.current_tag.append_data(chr(codepoint))

    # Add the character for a numeric reference; name is the decimal
    # digits, or "x"/"X" followed by hexadecimal digits.
    def handle_charref(self, name):
        if name.startswith(("x", "X")):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        self.current_tag.append_data(c)

    # Write a debug dump of the tree below the current element.
    def debug(self):
        self.current_tag.debug("")

    # Convert the tree below the current element to markdown.
    def to_md(self):
        return self.current_tag.to_md()
# Return a list of all descendent tags with the given tagname.
# The search does not descend into an element that itself matches.
def find_tags(block, tagname):
    found = []
    for child in block.elements:
        if not isinstance(child, Tag):
            continue
        if child.tagname == tagname:
            found.append(child)
        else:
            found.extend(find_tags(child, tagname))
    return found
# The DWARF pages all start with two table elements as the page header,
# and end with a table element as the page footer. Recognize these by
# the marker text that appears in one of their cells.
def is_header_footer(elem):
    if not (isinstance(elem, Tag) and elem.tagname == "table"):
        return False
    markers = ("[!dwarf.png]", "[HOME]", "All logos and trademarks")
    return any(
        marker in cell_text
        for cell_text in (cell.to_md() for cell in find_tags(elem, "td"))
        for marker in markers
    )
# Command-line driver: parse the options, read HTML from stdin, and
# print the markdown conversion to stdout.
strip_header_footer = False
convert_proposal = False
dwarf_version = "6"
debug = False

try:
    opts, args = getopt.getopt(sys.argv[1:], "hdpv:")
except getopt.GetoptError as err:
    # err is an exception object; it must be converted before it can
    # be concatenated with a str.
    sys.stderr.write(str(err) + "\n")
    sys.exit(2)

for o, a in opts:
    if o == "-p":
        convert_proposal = True
    elif o == "-h":
        strip_header_footer = True
    elif o == "-v":
        dwarf_version = a
    elif o == "-d":
        debug = True

parser = HtmlToMD()
for l in sys.stdin:
    parser.feed(l.expandtabs(4).replace("\r\n", "\n"))
parser.close()

if debug:
    parser.debug()

root = parser.current_tag

if strip_header_footer:
    root.elements = [e for e in root.elements if not is_header_footer(e)]

if convert_proposal:
    # Issue pages: the first row of the first table carries the metadata
    # cells, and the first <pre> element carries the proposal text.
    table_elems = find_tags(root, "table")
    if len(table_elems) > 0:
        meta_info_tbl = table_elems[0]
        if len(meta_info_tbl.elements) > 0:
            first_row = meta_info_tbl.elements[0]
            meta_info = first_row.elements
            if len(meta_info) >= 6:
                prop_num = meta_info[0].to_md()
                prop_author = meta_info[1].to_md()
                prop_title = meta_info[2].to_md()
                prop_type = meta_info[3].to_md()
                prop_status = meta_info[4].to_md()
                prop_champion = meta_info[5].to_md()
                # The issue number starts with the date as YYMMDD.
                prop_submit_date = re.sub(r"(\d\d)(\d\d)(\d\d)\..*", r"20\1-\2-\3", prop_num)
                print("Title: %s" % prop_title)
                print("Author: %s" % prop_author)
                print("Champion: %s" % prop_champion)
                print("Submit-Date: %s" % prop_submit_date)
                print("Propid: %s" % prop_num)
                print("Type: %s" % prop_type)
                print("Status: %s" % prop_status)
                print("Version: %s" % dwarf_version)
                print("")
    pre_elems = find_tags(parser.current_tag, "pre")
    if pre_elems:
        print(pre_elems[0].pre_to_md())
else:
    if parser.title:
        print("Title: %s" % parser.title)
        print("")
    print(root.to_md())
author	Cary Coutant <ccoutant@gmail.com>	2023-03-23 17:14:58 -0700
committer	Cary Coutant <ccoutant@gmail.com>	2023-03-23 17:14:58 -0700
commit	49409d19ef4c6de858905775b3662e14ee735bcd (patch)
tree	a1830713349b8edd0693bd90efb7af269cf9282e /tools
parent	issues/170427.3 Fix more typos found by Pedro Alves (diff)

diff --git a/tools/html-to-md.py b/tools/html-to-md.py
new file mode 100755
index 0000000..ec6202b
--- /dev/null
+++ b/tools/html-to-md.py
@@ -0,0 +1,613 @@
	1	#! /usr/bin/python3
	2
	3	# html-to-md
	4	# Rudimentary conversion of HTML to Markdown.
	5	# -h
	6	# Strips the header and footer elements from the page.
	7	# Looks for the specific patterns used in the DWARF HTML code.
	8	# -p
	9	# Tailors the conversion to DWARF issue pages, extracting
	10	# metadata from the start of the file, and using the
	11	# contents of the <pre> element as the markdown.
	12	# The metadata is placed at the top of the generated
	13	# markdown file.
	14	# -v n
	15	# Specifies the target DWARF version number (for issue metadata).
	16	# Default: 6.
	17
	18	import sys
	19	import re
	20	import getopt
	21	from html.parser import HTMLParser
	22	from html.entities import name2codepoint
	23
	24	# These tags are treated as block tags; i.e., they are laid out
	25	# as blocks on the page.
	26
	27	block_tags = [
	28	"__root__",
	29	"html", "head", "meta", "title", "body",
	30	"h1", "h2", "h3", "h4", "h5", "h6",
	31	"div", "p",
	32	"table", "thead", "tbody", "tr", "th", "td", "caption",
	33	"pre", "ul", "ol", "li", "dl", "dt", "dd",
	34	"hr"
	35	]
	36
	37	# These tags have a well-defined structure, with specific
	38	# kinds of tags allowed within. Unstructured text inside
	39	# these tags will be ignored.
	40
	41	structured_tags = [
	42	"table", "thead", "tbody", "tr"
	43	]
	44
	45	# These tags are allowed to contain other block tags.
	46	# Used to help decide whether to auto-close the current
	47	# tag or nest the new one inside.
	48
	49	can_contain_block_tags = [
	50	"html", "head", "body", "div",
	51	"table", "thead", "tbody", "tr", "th", "td",
	52	"pre", "ul", "ol", "li", "dl", "dd"
	53	]
	54
	55	# These tags are treated as inline content, and may be
	56	# contained inside any other (non-structured) tag.
	57
	58	inline_tags = [
	59	"i", "b", "em", "strong", "code", "br", "img", "a", "small",
	60	"button"
	61	]
	62
	63	# These tags have optional closing tags, and should be
	64	# automatically closed when we see a starting tag that
	65	# cannot be contained inside it.
	66
	67	auto_closing_tags = [
	68	"meta", "body", "p", "li", "dt", "dd", "tr", "td", "th", "a"
	69	]
	70
	71	# These tags do not require a closing tag.
	72
	73	unbalanced_tags = [
	74	"br", "hr", "img"
	75	]
	76
	77	# We ignore the following tags for the purposes of conversion.
	78
	79	ignored_tags = [
	80	"html", "head", "thead", "tbody", "hr", "font"
	81	]
	82
	83	# These are the heading tags, in order of precedence.
	84
	85	heading_tags = [
	86	"h1", "h2", "h3", "h4", "h5", "h6"
	87	]
	88
	89	# Return True if a tag1 element can contain a tag2 element.
	90
	91	def can_contain(tag1, tag2):
	92	if tag1 == "a" and tag2 == "a":
	93	return False
	94	if tag1 == "li" and tag2 == "li":
	95	return False
	96	if tag1 in ["dt", "dd"] and tag2 in ["dt", "dd"]:
	97	return False
	98	if tag1 == "table" and tag2 not in ["tr", "caption"]:
	99	return False
	100	if tag1 == "tr" and tag2 not in ["th", "td"]:
	101	return False
	102	if tag2 == "body":
	103	return False
	104	return tag1 in can_contain_block_tags or tag2 in inline_tags
	105
	106	# Create a substitution reference for ref and return the inline text.
	107
	108	def create_sub(ref):
	109	# TODO: create a unique substitution reference for ref.
	110	return ref
	111
	112	# Class Tag represents an HTML element.
	113	# tagname is the HTML tag.
	114	# attrs is a dictionary of the element's attributes.
	115	# parent is a pointer to the containing element.
	116	# loc is the line number within the HTML source.
	117	# aux is a list of auxiliary blocks (e.g., substitution definitions) to write
	118	# at the end of the document.
	119
	120	class Tag:
	121	def __init__(self, tagname, attrs, parent, loc):
	122	self.tagname = tagname
	123	self.attrs = attrs
	124	self.parent = parent
	125	self.elements = []
	126	self.loc = loc
	127	self.aux = []
	128
	129	# Return True if we are inside a tagname element.
	130	def is_inside(self, tagname):
	131	node = self.parent
	132	while node:
	133	if node.tagname == tagname:
	134	return True
	135	node = node.parent
	136	return False
	137
	138	# Append a new child element.
	139	def append_element(self, elem):
	140	if elem.tagname == "body":
	141	self.elements += elem.elements
	142	else:
	143	self.elements.append(elem)
	144
	145	# Append a new auxiliary block.
	146	def append_aux(self, elem):
	147	self.aux.append(elem)
	148
	149	# Append a new text node.
	150	def append_data(self, data):
	151	if self.tagname in structured_tags:
	152	if data.strip():
	153	sys.stderr.write("Error: found unstructured data inside %s element at line %d\n" % (self.tagname, self.loc[0]))
	154	return
	155	if data:
	156	self.elements.append(data)
	157
	158	# Close the current element. If a block tag, append it as a block
	159	# to the parent; if an inline tag, convert it to inline text and
	160	# append the text to the parent. Return the parent element.
	161	def close(self):
	162	if not self.parent:
	163	sys.stderr.write("Error: attempted to close root tag\n")
	164	return self
	165	if self.tagname in block_tags:
	166	self.parent.append_element(self)
	167	else:
	168	self.parent.append_data(self.to_md_inline())
	169	for item in self.aux:
	170	self.parent.append_aux(item)
	171	return self.parent
	172
	173	# Write a debug representation of the current subtree.
	174	def debug(self, level):
	175	sys.stderr.write(level + self.tagname + "\n")
	176	for e in self.elements:
	177	if isinstance(e, Tag):
	178	e.debug(level + "| ")
	179	else:
	180	sys.stderr.write(level + "| " + repr(e) + "\n")
	181
	182	# Convert the current tag to inline markdown.
	183	def to_md_inline(self):
	184	if self.tagname == "br":
	185	# Ignore <br> tags for now.
	186	return ""
	187
	188	elif self.tagname == "img":
	189	return self.img_to_md()
	190
	191	elif self.tagname == "i" or self.tagname == "em":
	192	text = self.collect_inline_elements().strip()
	193	return "*" + text + "*" if text else ""
	194
	195	elif self.tagname == "b" or self.tagname == "strong":
	196	text = self.collect_inline_elements().strip()
	197	return "**" + text + "**" if text else ""
	198
	199	elif self.tagname == "code" and self.is_inside("pre"):
	200	return self.collect_inline_elements()
	201
	202	elif self.tagname == "code":
	203	text = self.collect_inline_elements().strip()
	204	return "``" + text + "``" if text else ""
	205
	206	elif self.tagname == "a":
	207	href = self.attrs["href"] if "href" in self.attrs else "#"
	208	text = self.collect_inline_elements().strip()
	209	return "[" + text + "](%s)" % href
	210
	211	else:
	212	return self.collect_inline_elements()
	213
	214	# Convert all child elements to inline markdown and collect them into
	215	# one chunk of text.
	216	def collect_inline_elements(self):
	217	text = ""
	218	for elem in self.elements:
	219	if isinstance(elem, Tag):
	220	elem = elem.to_md_inline()
	221	text += elem
	222	return text
	223
	224	# Convert a block element to markdown.
	225	def to_md(self):
	226	text = ""
	227
	228	if self.tagname == "title":
	229	text = ""
	230
	231	elif self.tagname == "table":
	232	text = self.table_to_md()
	233
	234	elif self.tagname == "pre":
	235	text = indent(self.pre_to_md())
	236
	237	elif self.tagname in heading_tags:
	238	text = self.heading_to_md(heading_tags.index(self.tagname))
	239
	240	elif self.tagname in ["ul", "ol"]:
	241	text = self.list_to_md()
	242
	243	elif self.tagname == "dl":
	244	text = self.dl_to_md()
	245
	246	else:
	247	text = self.block_tag_to_md()
	248
	249	if not self.parent and self.aux:
	250	text += "\n\n" + "\n\n".join(self.aux)
	251	return text
	252
	253	# Return markdown for a heading element.
	254	def heading_to_md(self, level):
	255	blocks = [elem.to_md() if isinstance(elem, Tag) else elem for elem in self.elements]
	256	text = "".join(blocks)
	257	return "#" * (level + 1) + " " + text.replace("\n", " ")
	258
	259	# Return markdown for a <pre> element.
	260	def pre_to_md(self):
	261	blocks = [elem.pre_to_md() if isinstance(elem, Tag) else elem for elem in self.elements]
	262	return "".join(blocks).strip("\n")
	263
	264	# Return markdown for a <ul> or <ol> element.
	265	def list_to_md(self):
	266	marker = "* " if self.tagname == "ul" else "1. "
	267	elems = []
	268	for elem in self.elements:
	269	if isinstance(elem, Tag) and elem.tagname == "li":
	270	elems.append(hanging_indent(elem.to_md().strip(), marker))
	271	return "\n\n".join(elems)
	272
	273	# Return markdown for a <dl> element.
	274	def dl_to_md(self):
	275	dl_items = []
	276	dt_item = ""
	277	dd_items = []
	278	text = ""
	279	for elem in self.elements:
	280	if isinstance(elem, Tag) and elem.tagname == "dt":
	281	text = text.strip()
	282	if dt_item or dd_items or text:
	283	dd_items += form_paras(text)
	284	dd_items = map(indent, dd_items)
	285	dl_items.append(dt_item + "\n" + "\n\n".join(dd_items))
	286	dt_item = ""
	287	dd_items = []
	288	text = ""
	289	dt_item = elem.to_md()
	290	else:
	291	if isinstance(elem, Tag):
	292	dd_items += form_paras(text)
	293	text = ""
	294	block = elem.to_md()
	295	if block:
	296	dd_items.append(block)
	297	else:
	298	text += elem
	299	text = text.strip()
	300	if dt_item or dd_items or text:
	301	dd_items += form_paras(text)
	302	dd_items = map(indent, dd_items)
	303	dl_items.append(dt_item + "\n" + "\n\n".join(dd_items))
	304	return "\n\n".join(dl_items)
	305
	306	# Return markdown for an <img> element.
	307	def img_to_md(self):
	308	if "src" in self.attrs:
	309	src = self.attrs["src"]
	310	else:
	311	src = "#"
	312	sub = create_sub(src)
	313	if "alt" in self.attrs:
	314	alt = self.attrs["alt"]
	315	else:
	316	alt = ""
	317	return "[!" + src + "]"
	318
	319	# Convert a table to markdown.
	320	def table_to_md(self):
	321	caption = ""
	322	rows = []
	323	column_widths = []
	324
	325	# Find the <caption> (if any) and the <tr> elements.
	326	header_rows = 0
	327	for row in self.elements:
	328	if not isinstance(row, Tag):
	329	pass
	330	elif row.tagname == "caption":
	331	caption = row.to_md()
	332	elif row.tagname == "tr":
	333	# Extract the <th> and <td> elements.
	334	cols = list(filter(lambda e: isinstance(e, Tag) and e.tagname in ["th","td"], row.elements))
	335	if all([e.tagname == "th" for e in cols]):
	336	header_rows += 1
	337	if cols:
	338	cols = list(map(lambda col: col.to_md(), cols))
	339	rows.append(cols)
	340	# Record the max column width for each column
	341	if len(cols) > len(column_widths):
	342	column_widths.extend(0 for _ in range(len(column_widths), len(cols)))
	343	for i in range(len(cols)):
	344	column_widths[i] = max(column_widths[i], longest_line(cols[i]))
	345	else:
	346	sys.stderr.write("Found <%s> tag in <table> at %d\n" % (row.tagname, row.loc[0]))
	347
	348	# Generate single-row and single-column tables as regular paragraphs.
	349	if len(column_widths) == 1 or len(rows) == 1:
	350	blocks = []
	351	for row in rows:
	352	blocks += row
	353	return "\n\n".join(blocks)
	354
	355	# Generate the table, row by row.
	356	text = ""
	357	for row in rows:
	358	col_num = 0
	359	padded_cells = []
	360	for cell in row:
	361	if len(cell) < column_widths[col_num]:
	362	cell += " " * (column_widths[col_num] - len(cell))
	363	padded_cells.append(cell)
	364	col_num += 1
	365	text += "|" + "|".join(padded_cells) + "|\n"
	366
	367	return text
	368
	369	# Convert child elements to md, and combine the results.
	370	def block_tag_to_md(self):
	371	blocks = []
	372	text = ""
	373	for elem in self.elements:
	374	if isinstance(elem, Tag):
	375	blocks += form_paras(text)
	376	text = ""
	377	block = elem.to_md()
	378	if block:
	379	blocks.append(block)
	380	else:
	381	text += elem
	382	blocks += form_paras(text)
	383	return "\n\n".join(blocks)
	384
	385	# Indent all lines of a paragraph by 4 spaces.
	386	def indent(text):
	387	lines = []
	388	for s in text.split("\n"):
	389	if s:
	390	lines.append("    " + s)
	391	else:
	392	lines.append("")
	393	return "\n".join(lines)
	394
	395	# Place marker in front of the first line of text, and indent
	396	# all subsequent lines.
	397	def hanging_indent(text, marker):
	398	prefix = " " * len(marker)
	399	lines = []
	400	for s in text.split("\n"):
	401	if lines:
	402	lines.append(prefix + s.strip())
	403	elif s:
	404	lines.append(marker + s.strip())
	405	else:
	406	lines.append("")
	407	return "\n".join(lines)
	408
	409	# Form paragraphs out of text, with single blank lines between each paragraph.
	410
	411	def form_paras(text):
	412	lines = [s.strip() for s in text.split("\n")]
	413	paras = []
	414	para = []
	415	for l in lines:
	416	if l:
	417	para.append(l)
	418	elif para:
	419	paras.append("\n".join(para))
	420	para = []
	421	if para:
	422	paras.append("\n".join(para))
	423	return paras
	424
	425	# Find longest line in a paragraph.
	426
	427	def longest_line(text):
	428	lengths = [len(s) for s in text.split("\n")]
	429	return max(lengths)
	430
	431	# Class HtmlToMD: Parse the HTML source, build an intermediate tree,
	432	# and convert to Markdown.
	433
	434	class HtmlToMD(HTMLParser):
	435	def __init__(self):
	436	self.current_tag = Tag("__root__", None, None, (0,0))
	437	self.title = ""
	438	HTMLParser.__init__(self)
	439
	440	def close(self):
	441	HTMLParser.close(self)
	442	unclosed = []
	443	while self.current_tag.parent is not None:
	444	if self.current_tag.tagname not in auto_closing_tags:
	445	unclosed.append("%s (%d)" % (self.current_tag.tagname, self.current_tag.loc[0]))
	446	self.current_tag = self.current_tag.close()
	447	if unclosed:
	448	sys.stderr.write("End of file with unclosed tags: %s\n" % ", ".join(unclosed))
	449
	450	def handle_starttag(self, tagname, attrs):
	451	if tagname in ignored_tags:
	452	return
	453
	454	while (self.current_tag.tagname in auto_closing_tags
	455	and not can_contain(self.current_tag.tagname, tagname)):
	456	self.current_tag = self.current_tag.close()
	457
	458	if not can_contain(self.current_tag.tagname, tagname):
	459	if self.current_tag.tagname == "table":
	460	sys.stderr.write("Inserting missing <tr> before <%s> at line %d\n" % (tagname, self.getpos()[0]))
	461	self.handle_starttag("tr", [])
	462	elif self.current_tag.tagname == "tr":
	463	sys.stderr.write("Inserting missing <td> before <%s> at line %d\n" % (tagname, self.getpos()[0]))
	464	self.handle_starttag("td", [])
	465
	466	attrs = dict(attrs)
	467	tag = Tag(tagname, attrs, self.current_tag, self.getpos())
	468	self.current_tag = tag
	469	if tagname in unbalanced_tags:
	470	self.current_tag = self.current_tag.close()
	471
	472	def handle_endtag(self, tagname):
	473	if tagname in ignored_tags or tagname in unbalanced_tags:
	474	return
	475
	476	if tagname == "title":
	477	self.title = self.current_tag.to_md_inline()
	478
	479	if tagname != self.current_tag.tagname and self.current_tag.is_inside(tagname):
	480	while tagname != self.current_tag.tagname:
	481	if (self.current_tag.tagname in auto_closing_tags
	482	and can_contain(tagname, self.current_tag.tagname)):
	483	sys.stderr.write("Auto-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
	484	else:
	485	sys.stderr.write("Force-closing %s at line %d\n" % (self.current_tag.tagname, self.getpos()[0]))
	486	self.current_tag = self.current_tag.close()
	487
	488	if tagname == self.current_tag.tagname:
	489	self.current_tag = self.current_tag.close()
	490	else:
	491	sys.stderr.write("Found </%s> at line %d, expected </%s>.\n" % (tagname, self.getpos()[0], self.current_tag.tagname))
	492
	493	def handle_data(self, data):
	494	# Convert old troff-style quoting conventions to Unicode quotes.
	495	data = re.sub(r"``", u"\u201c", data)
	496	data = re.sub(r"''", u"\u201d", data)
	497	# Markup code terms like DW_AT_type and .debug_info.
	498	data = re.sub(r'(?<!\w)([.a-zA-Z]+_[a-zA-Z0-9_]+)\b', r'`\1`', data)
	499	self.current_tag.append_data(data)
	500
	501	def handle_entityref(self, name):
	502	self.current_tag.append_data(unichr(name2codepoint[name]))
	503
	504	def handle_charref(self, name):
	505	if name.startswith('x'):
	506	c = unichr(int(name[1:], 16))
	507	else:
	508	c = unichr(int(name))
	509	self.current_tag.append_data(c)
	510
	511	def debug(self):
	512	self.current_tag.debug("")
	513
	514	def to_md(self):
	515	return self.current_tag.to_md()
	516
# Return a list of descendant tags with the given tagname. The search
# does not descend into a tag that itself matches.
	518
	519	def find_tags(block, tagname):
	520	ret = []
	521	for e in block.elements:
	522	if isinstance(e, Tag):
	523	if e.tagname == tagname:
	524	ret.append(e)
	525	else:
	526	ret += find_tags(e, tagname)
	527	return ret
	528
	529	# The DWARF pages all start with two table elements as the page header,
	530	# and end with a table element as the page footer. Recognize these.
	531
	532	def is_header_footer(elem):
	533	if not isinstance(elem, Tag):
	534	return False
	535	if elem.tagname != "table":
	536	return False
	537	for cell in find_tags(elem, "td"):
	538	text = cell.to_md()
	539	if "[!dwarf.png]" in text:
	540	return True
	541	if "[HOME]" in text:
	542	return True
	543	if "All logos and trademarks" in text:
	544	return True
	545	return False
	546
	547	strip_header_footer = False
	548	convert_proposal = False
	549	dwarf_version = "6"
	550	debug = False
	551
	552	try:
	553	opts, args = getopt.getopt(sys.argv[1:], "hdpv:")
	554	except getopt.GetoptError as err:
	555	sys.stderr.write(err + "\n")
	556	sys.exit(2)
	557	for o, a in opts:
	558	if o == "-p":
	559	convert_proposal = True
	560	elif o == "-h":
	561	strip_header_footer = True
	562	elif o == "-v":
	563	dwarf_version = a
	564	elif o == "-d":
	565	debug = True
	566
	567	parser = HtmlToMD()
	568
	569	for l in sys.stdin:
	570	parser.feed(l.expandtabs(4).replace("\r\n", "\n"))
	571	parser.close()
	572
	573	if debug:
	574	parser.debug()
	575
	576	root = parser.current_tag
	577	if strip_header_footer:
	578	root.elements = list(filter(lambda e: not is_header_footer(e), root.elements))
	579
	580	if convert_proposal:
	581	table_elems = find_tags(root, "table")
	582	if len(table_elems) > 0:
	583	meta_info_tbl = table_elems[0]
	584	if len(meta_info_tbl.elements) > 0:
	585	first_row = meta_info_tbl.elements[0]
	586	meta_info = first_row.elements
	587	if len(meta_info) >= 6:
	588	prop_num = meta_info[0].to_md()
	589	prop_author = meta_info[1].to_md()
	590	prop_title = meta_info[2].to_md()
	591	prop_type = meta_info[3].to_md()
	592	prop_status = meta_info[4].to_md()
	593	prop_champion = meta_info[5].to_md()
	594	prop_submit_date = re.sub(r"(\d\d)(\d\d)(\d\d)\..*", r"20\1-\2-\3", prop_num)
	595	print("Title: %s" % prop_title)
	596	print("Author: %s" % prop_author)
	597	print("Champion: %s" % prop_champion)
	598	print("Submit-Date: %s" % prop_submit_date)
	599	print("Propid: %s" % prop_num)
	600	print("Type: %s" % prop_type)
	601	print("Status: %s" % prop_status)
	602	print("Version: %s" % dwarf_version)
	603	print("")
	604
	605	pre_elems = find_tags(parser.current_tag, "pre")
	606	if pre_elems:
	607	print(pre_elems[0].pre_to_md())
	608
	609	else:
	610	if parser.title:
	611	print("Title: %s" % parser.title)
	612	print("")
	613	print(root.to_md())