Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions data/muddy_waters.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<p>
<b>McKinley Morganfield</b> (April 4, 1914&nbsp;– April 30, 1983),<sup
id="cite_ref-1"
class="reference"
><a href="#cite_note-1"
><span class="cite-bracket">[</span>1<span class="cite-bracket"
>]</span
></a
></sup
><sup id="cite_ref-FOOTNOTEGordon20024–5_2-0" class="reference"
><a href="#cite_note-FOOTNOTEGordon20024–5-2"
><span class="cite-bracket">[</span>2<span class="cite-bracket"
>]</span
></a
></sup
>
better known as <b>Muddy Waters</b> was an American
<a href="/wiki/Blues" title="Blues">blues</a>
<a href="/wiki/Singer-songwriter" title="Singer-songwriter"
>singer-songwriter</a
>
and musician who was an important figure in the post-World War II blues
scene, and is often cited as the "father of modern
<a href="/wiki/Chicago_blues" title="Chicago blues">Chicago blues</a>".<sup
id="cite_ref-cantbe-dvd_3-0"
class="reference"
><a href="#cite_note-cantbe-dvd-3"
><span class="cite-bracket">[</span>3<span class="cite-bracket"
>]</span
></a
></sup
>
His style of playing has been described as "raining down
<a href="/wiki/Mississippi_Delta" title="Mississippi Delta">Delta</a>
beatitude".<sup id="cite_ref-4" class="reference"
><a href="#cite_note-4"
><span class="cite-bracket">[</span>4<span class="cite-bracket"
>]</span
></a
></sup
>
</p>
13 changes: 12 additions & 1 deletion src/dom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,18 @@ pub fn extract_text(handle: Handle, text: &mut String, deep: bool) {
let c = child.clone();
match c.data {
Text { ref contents } => {
text.push_str(contents.borrow().as_ref());
let cc = contents.borrow().to_string();

if text.len() > 0 && text.chars().last().unwrap() != ' ' {
let needs_space = match text.trim().chars().last().unwrap() {
'.' | '!' | ',' | '"' | '\'' => false,
_ => true,
};
if needs_space {
text.push_str(" ");
}
}
text.push_str(cc.trim());
}
Element { .. } => {
if deep {
Expand Down
2 changes: 1 addition & 1 deletion src/scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pub static PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^
pub static UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\
|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\
|pagination|pager|popup|tweet|twitter\
|ssba";
|ssba|mw-editsection|cite_ref-";
/// `|`-separated list of name fragments.
/// NOTE(review): presumably compiled into a regex alternation that marks nodes
/// likely to hold the main content — confirm at the use site.
pub static LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow|content|hentry";
pub static POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\
Expand Down
13 changes: 13 additions & 0 deletions tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,16 @@ fn test_fix_img_links() {
let product = readability::extractor::extract(&mut file, &url).unwrap();
assert_eq!(product.content, "<!DOCTYPE html><html><head><title>This is title</title></head><body><p><img src=\"https://example.com/poop.png\"></p></body></html>");
}

#[test]
fn test_extract_text() {
    // Regression check for spacing between adjacent text nodes, using the
    // Muddy Waters fixture.
    //
    // Before the change the words ran together:
    //   better known asMuddy Waterswas an Americanbluessinger-songwriterand musician who was an important figure
    // After the change they are separated:
    //   better known as Muddy Waters was an American blues singer-songwriter and musician who was an important figure
    let expected = r#"McKinley Morganfield (April 4, 1914 – April 30, 1983),better known as Muddy Waters was an American blues singer-songwriter and musician who was an important figure in the post-World War II blues
scene, and is often cited as the "father of modern Chicago blues ".His style of playing has been described as "raining down Delta beatitude"."#;

    let mut fixture = File::open("./data/muddy_waters.html").unwrap();
    let base_url = Url::parse("https://example.com").unwrap();
    let extracted = readability::extractor::extract(&mut fixture, &base_url).unwrap();

    // Echo the extracted text so a mismatch can be inspected in the test output.
    println!("{}", extracted.text);
    assert_eq!(expected, extracted.text);
}