Skip to content

Commit 9301c39

Browse files
authored
Update Hacker News HTML text parsing logic (#99)
## Changes
- update parsing logic to reflect the new HN Algolia API change regarding the use of `<p>` for paragraph breaks
- clean up parsing code
1 parent d09b1d2 commit 9301c39

File tree

5 files changed

+97
-83
lines changed

5 files changed

+97
-83
lines changed

hackernews_tui/src/client/mod.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,7 @@ impl HNClient {
7777
format!("get item (id={item_id}) using {request_url}")
7878
);
7979

80-
// The item's text returned from HN official APIs may have `<p>` tags representing
81-
// paragraph breaks. Convert `<p>` tags to newlines to make the text easier to read.
82-
let text = decode_html(&item.text.unwrap_or_default()).replace("<p>", "\n\n");
80+
let text = decode_html(&item.text.unwrap_or_default());
8381

8482
// Construct the shortened text to represent the page's title if not exist
8583
let chars = text.replace('\n', " ").chars().collect::<Vec<_>>();

hackernews_tui/src/model.rs

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,10 @@ pub struct VoteData {
5959
}
6060

6161
#[derive(Debug, Clone)]
62-
/// A HackerNews item which can be either a story or a comment.
62+
/// A Hacker News item which can be either a story or a comment.
6363
///
64-
/// This struct is a shared representation between a story and
65-
/// a comment for rendering the item's content.
64+
/// This struct is a shared representation between a story and a comment
65+
/// and is used to render their content.
6666
pub struct HnItem {
6767
pub id: u32,
6868
pub level: usize,
@@ -107,19 +107,18 @@ impl From<Story> for HnItem {
107107
),
108108
]);
109109

110-
let mut story_text = story.content;
110+
// parse story's HTML content
111+
let result = parse_hn_html_text(story.content, Style::default(), 0);
111112

112-
let minimized_text = if story_text.is_empty() {
113+
// construct a minimized text representing the collapsed story's content
114+
let minimized_text = if result.content.source().is_empty() {
113115
metadata.clone()
114116
} else {
115-
story_text = format!("\n{story_text}");
116-
117117
utils::combine_styled_strings([metadata.clone(), StyledString::plain("... (more)")])
118118
};
119119

120-
let mut text = metadata;
121-
let result = parse_hn_html_text(story_text, Style::default(), 0);
122-
text.append(result.s);
120+
let text =
121+
utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);
123122

124123
HnItem {
125124
id: story.id,
@@ -144,17 +143,20 @@ impl From<Comment> for HnItem {
144143
),
145144
]);
146145

147-
let mut text = utils::combine_styled_strings([metadata.clone(), StyledString::plain("\n")]);
146+
// construct a minimized text representing the collapsed comment's content
148147
let minimized_text = utils::combine_styled_strings([
149-
metadata,
148+
metadata.clone(),
150149
StyledString::styled(
151150
format!("({} more)", comment.n_children + 1),
152151
component_style.metadata,
153152
),
154153
]);
155154

155+
// parse the comment's content
156156
let result = parse_hn_html_text(comment.content, Style::default(), 0);
157-
text.append(result.s);
157+
158+
let text =
159+
utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);
158160

159161
HnItem {
160162
id: comment.id,

hackernews_tui/src/parser/article.rs

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use super::html::HTMLParsedResult;
1+
use super::html::HTMLTextParsedResult;
22
use super::rcdom::{Handle, NodeData, RcDom};
33
use crate::parser::html::HTMLTableParsedResult;
44
use crate::prelude::*;
@@ -40,7 +40,7 @@ impl Article {
4040
/// # Arguments:
4141
/// * `max_width`: the maximum width of the parsed content. This is mostly used
4242
/// to construct an HTML table using `comfy_table`.
43-
pub fn parse(&self, max_width: usize) -> Result<HTMLParsedResult> {
43+
pub fn parse(&self, max_width: usize) -> Result<HTMLTextParsedResult> {
4444
debug!("parse article ({:?})", self);
4545

4646
// parse HTML content into DOM node(s)
@@ -90,21 +90,21 @@ impl Article {
9090
base_link_id: usize,
9191
mut style: Style,
9292
mut args: ArticleParseArgs,
93-
) -> (HTMLParsedResult, bool) {
93+
) -> (HTMLTextParsedResult, bool) {
9494
// TODO: handle parsing <ol> tags correctly
9595

9696
debug!(
9797
"parse dom node: {:?}, style: {:?}, args: {:?}",
9898
node, style, args
9999
);
100100

101-
let mut result = HTMLParsedResult::default();
101+
let mut result = HTMLTextParsedResult::default();
102102
let mut suffix = StyledString::new();
103103

104104
let mut visit_block_element_cb = || {
105105
if !args.is_first_element_in_block {
106-
result.s.append_plain("\n\n");
107-
result.s.append_styled(&args.prefix, style);
106+
result.content.append_plain("\n\n");
107+
result.content.append_styled(&args.prefix, style);
108108
}
109109
args.is_first_element_in_block = true;
110110
};
@@ -128,7 +128,7 @@ impl Article {
128128

129129
has_non_ws_text |= !text.trim().is_empty();
130130

131-
result.s.append_styled(text, style);
131+
result.content.append_styled(text, style);
132132
}
133133
NodeData::Element {
134134
ref name,
@@ -151,7 +151,9 @@ impl Article {
151151
style = style.combine(component_style.header);
152152
}
153153
expanded_name!(html "br") => {
154-
result.s.append_styled(format!("\n{}", args.prefix), style);
154+
result
155+
.content
156+
.append_styled(format!("\n{}", args.prefix), style);
155157
}
156158
expanded_name!(html "p") => visit_block_element_cb(),
157159
expanded_name!(html "code") => {
@@ -169,15 +171,15 @@ impl Article {
169171

170172
style = style.combine(component_style.multiline_code_block);
171173

172-
result.s.append_styled(" ", style);
174+
result.content.append_styled(" ", style);
173175
}
174176
expanded_name!(html "blockquote") => {
175177
visit_block_element_cb();
176178

177179
args.prefix = format!("{}▎ ", args.prefix);
178180
style = style.combine(component_style.quote);
179181

180-
result.s.append_styled("▎ ", style);
182+
result.content.append_styled("▎ ", style);
181183
}
182184
expanded_name!(html "table") => {
183185
let mut table_result = HTMLTableParsedResult::default();
@@ -211,7 +213,7 @@ impl Article {
211213
table.add_row(row.into_iter().map(|c| c.source().to_owned()));
212214
}
213215

214-
result.s.append_styled(format!("\n\n{table}"), style);
216+
result.content.append_styled(format!("\n\n{table}"), style);
215217

216218
return (result, true);
217219
}
@@ -225,7 +227,7 @@ impl Article {
225227
args.is_first_element_in_block = true;
226228

227229
result
228-
.s
230+
.content
229231
.append_styled(format!("\n{}• ", args.prefix), style);
230232
}
231233
expanded_name!(html "img") => {
@@ -240,10 +242,12 @@ impl Article {
240242
};
241243

242244
if !args.is_first_element_in_block {
243-
result.s.append_plain("\n\n");
245+
result.content.append_plain("\n\n");
244246
}
245-
result.s.append_styled(&img_desc, style);
246-
result.s.append_styled(" (image)", component_style.metadata);
247+
result.content.append_styled(&img_desc, style);
248+
result
249+
.content
250+
.append_styled(" (image)", component_style.metadata);
247251
}
248252
expanded_name!(html "a") => {
249253
// find `href` attribute of an <a> tag
@@ -291,7 +295,7 @@ impl Article {
291295
}
292296
});
293297

294-
result.s.append(suffix);
298+
result.content.append(suffix);
295299
(result, has_non_ws_text)
296300
}
297301

@@ -331,7 +335,7 @@ impl Article {
331335
);
332336

333337
result.links.append(&mut child_result.links);
334-
s.append(child_result.s);
338+
s.append(child_result.content);
335339
});
336340

337341
if !is_header {

0 commit comments

Comments
 (0)