diff options
author | Tuomas Siipola | 2020-01-07 20:46:59 +0200 |
---|---|---|
committer | Tuomas Siipola | 2020-01-07 20:46:59 +0200 |
commit | 362c1fef908b991a750a1945468b2df9bab6ed17 (patch) | |
tree | 541588ed7e0f3078f281a3707794c57687b92b7f | |
parent | 36bd71274d0e302405541e26150c35f6f0c63c5b (diff) |
-rw-r--r-- | backend/src/feed_parser.rs | 249 |
1 file changed, 128 insertions, 121 deletions
diff --git a/backend/src/feed_parser.rs b/backend/src/feed_parser.rs index 43f3439..8822f31 100644 --- a/backend/src/feed_parser.rs +++ b/backend/src/feed_parser.rs @@ -38,6 +38,9 @@ fn sanitize_html(input: &str) -> String { builder.clean(input).to_string() } +const ATOM_NS: &str = "http://www.w3.org/2005/Atom"; +const DC_NS: &str = "http://purl.org/dc/elements/1.1/"; + impl Feed { pub fn parse_rss(rss: &str) -> Result<Feed, String> { let doc = Document::parse(rss).map_err(|e| e.to_string())?; @@ -56,77 +59,79 @@ impl Feed { let mut items = Vec::new(); for child in channel.children() { - if child.is_element() { - match child.tag_name().name() { - "title" => title = child.text().map(str::trim).map(str::to_string), - "description" => description = child.text().map(str::trim).map(str::to_string), - "link" => link = child.text().map(str::trim).map(str::to_string), - "item" => { - let mut guid = None; - let mut title = None; - let mut description = None; - let mut link = None; - let mut author = None; - let mut published_at = None; - for child in child.children() { - match child.tag_name().name() { - "guid" => { - if let Some(text) = child.text().map(str::trim) { - guid = Some(text.to_string()); - if link.is_none() { - match child.attribute("isPermaLink") { - None | Some("true") => { - link = Some(text.to_string()) - } - _ => {} - } + if !child.is_element() || child.tag_name().namespace().is_some() { + continue; + } + match child.tag_name().name() { + "title" => title = child.text().map(str::trim).map(str::to_string), + "description" => description = child.text().map(str::trim).map(str::to_string), + "link" => link = child.text().map(str::trim).map(str::to_string), + "item" => { + let mut guid = None; + let mut title = None; + let mut description = None; + let mut link = None; + let mut author = None; + let mut published_at = None; + for child in child.children() { + if !child.is_element() { + continue; + } + match (child.tag_name().namespace(), child.tag_name().name()) 
{ + (None, "guid") => { + if let Some(text) = child.text().map(str::trim) { + guid = Some(text.to_string()); + if link.is_none() { + match child.attribute("isPermaLink") { + None | Some("true") => link = Some(text.to_string()), + _ => {} } } } - "title" => title = child.text().map(str::trim).map(str::to_string), - "description" => description = child.text().map(sanitize_html), - "link" => link = child.text().map(str::trim).map(str::to_string), - "author" => { - // Fallback if `dc:creator` is not defined later. - if author.is_none() { - author = child.text().map(str::trim).map(str::to_string) - } - } - "creator" => { - if child.tag_name().namespace() - == Some("http://purl.org/dc/elements/1.1/") - { - author = child.text().map(str::trim).map(str::to_string); - } - } - "pubDate" => { - published_at = child.text().map(str::trim).and_then(|text| { - DateTime::parse_from_rfc2822(text) - .map(|dt| dt.naive_utc()) - .ok() - }); + } + (None, "title") => { + title = child.text().map(str::trim).map(str::to_string) + } + (None, "description") => description = child.text().map(sanitize_html), + (None, "link") => { + link = child.text().map(str::trim).map(str::to_string) + } + (None, "author") => { + // Fallback if `dc:creator` is not defined later. 
+ if author.is_none() { + author = child.text().map(str::trim).map(str::to_string) } - _ => {} } + (Some(DC_NS), "creator") => { + author = child.text().map(str::trim).map(str::to_string); + } + (None, "pubDate") => { + published_at = child.text().map(str::trim).and_then(|text| { + DateTime::parse_from_rfc2822(text) + .map(|dt| dt.naive_utc()) + .ok() + }); + } + _ => {} } - items.push(Item { - guid: guid - .or_else(|| { - link.as_ref() - .or_else(|| title.as_ref()) - .or_else(|| description.as_ref()) - .map(String::clone) - }) - .ok_or_else(|| "item element without identifier")?, - title, - description, - link, - author, - published_at, - }); } - _ => {} + items.push(Item { + guid: guid + .or_else(|| { + link.as_ref() + .or_else(|| title.as_ref()) + .or_else(|| description.as_ref()) + .map(String::clone) + }) + .ok_or_else(|| "item element without identifier")?, + title, + description, + link, + author, + published_at, + }); } + _ => {} } } @@ -151,72 +156,74 @@ impl Feed { let mut items = Vec::new(); for child in feed.children() { - if child.is_element() { - match child.tag_name().name() { - "title" => title = child.text().map(str::trim).map(str::to_string), - "subtitle" => description = child.text().map(str::trim).map(str::to_string), - "link" => match child.attribute("rel") { - Some("alternate") | None => { - link = child.attribute("href").map(str::to_string) + if !child.is_element() || child.tag_name().namespace() != Some(ATOM_NS) { + continue; + } + match child.tag_name().name() { + "title" => title = child.text().map(str::trim).map(str::to_string), + "subtitle" => description = child.text().map(str::trim).map(str::to_string), + "link" => match child.attribute("rel") { + Some("alternate") | None => link = child.attribute("href").map(str::to_string), + _ => {} + }, + "entry" => { + let mut guid = None; + let mut title = None; + let mut description = None; + let mut link = None; + let mut author = None; + let mut published_at = None; + for child in 
child.children() { + if !child.is_element() || child.tag_name().namespace() != Some(ATOM_NS) { + continue; } - _ => {} - }, - "entry" => { - let mut guid = None; - let mut title = None; - let mut description = None; - let mut link = None; - let mut author = None; - let mut published_at = None; - for child in child.children() { - match child.tag_name().name() { - "id" => guid = child.text().map(str::trim).map(str::to_string), - "title" => title = child.text().map(str::trim).map(str::to_string), - "content" => description = child.text().map(sanitize_html), - "link" => match child.attribute("rel") { - Some("alternate") | None => { - link = child.attribute("href").map(str::to_string) - } - _ => {} - }, - "author" => { - for child in child.children() { - match child.tag_name().name() { - "name" => { - author = - child.text().map(str::trim).map(str::to_string); - break; - } - "email" => { - author = - child.text().map(str::trim).map(str::to_string); - // Fallback if `name` is not defined later. - } - _ => {} + match child.tag_name().name() { + "id" => guid = child.text().map(str::trim).map(str::to_string), + "title" => title = child.text().map(str::trim).map(str::to_string), + "content" => description = child.text().map(sanitize_html), + "link" => match child.attribute("rel") { + Some("alternate") | None => { + link = child.attribute("href").map(str::to_string) + } + _ => {} + }, + "author" => { + for child in child.children() { + match child.tag_name().name() { + "name" => { + author = + child.text().map(str::trim).map(str::to_string); + break; } + "email" => { + author = + child.text().map(str::trim).map(str::to_string); + // Fallback if `name` is not defined later. 
+ } + _ => {} } } - "updated" => { - published_at = child.text().map(str::trim).and_then(|text| { - DateTime::parse_from_rfc3339(text) - .map(|dt| dt.naive_utc()) - .ok() - }); - } - _ => {} } + "updated" => { + published_at = child.text().map(str::trim).and_then(|text| { + DateTime::parse_from_rfc3339(text) + .map(|dt| dt.naive_utc()) + .ok() + }); + } + _ => {} } - items.push(Item { - guid: guid.ok_or_else(|| "entry element without identifier")?, - title, - description, - link, - author, - published_at, - }); } - _ => {} + items.push(Item { + guid: guid.ok_or_else(|| "entry element without identifier")?, + title, + description, + link, + author, + published_at, + }); } + _ => {} } } |