Skip to content

Commit

Permalink
support finding feed URL from HTML page
Browse files Browse the repository at this point in the history
  • Loading branch information
fanzeyi committed Sep 8, 2020
1 parent 7b06b88 commit 53f78b9
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 1 deletion.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ quick-xml = "0.18.1"
log = "0.4.11"
femme = "2.1.0"
url = "2.1.1"
either = "1.5.3"

[dev-dependencies]
rand = "0.7"
Expand Down
77 changes: 76 additions & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use anyhow::{anyhow, Context, Result};
use async_std::prelude::FutureExt;
use either::Either;
use futures::stream::{self, StreamExt};
use log::{info, warn};
use prettytable::{cell, format, row, Table};
use std::io::{self, BufRead, Write};
use std::path::PathBuf;
use structopt::StructOpt;

Expand Down Expand Up @@ -51,6 +53,74 @@ impl FeedCommand {
Ok(())
}

/// Resolves feed-URL candidates found on a non-feed page to a single `RemoteFeed`.
///
/// * No candidates: returns an error.
/// * Exactly one candidate: it is fetched directly, without prompting.
/// * Several candidates: they are listed with a 0-based index and the user is
///   prompted on stdin until they pick one that is not already stored and that
///   fetches as a feed, or until they cancel with `c`.
///
/// # Errors
///
/// Fails when there are no candidates, when the user cancels or stdin reaches
/// end-of-file, when stdin/stdout I/O fails, when the database lookup fails, or
/// (single-candidate case only) when the candidate cannot be fetched as a feed.
async fn select_remotes(state: &State, candidates: Vec<String>) -> Result<RemoteFeed> {
    if candidates.is_empty() {
        return Err(anyhow!(
            "Supplied URL is not a feed, and we can't find any potential candidate in the page."
        ));
    }

    let length = candidates.len();
    if let [url] = candidates.as_slice() {
        log::info!(
            "Supplied URL is not a feed, but we found a potential candidate: {}",
            url
        );
        return Ok(RemoteFeed::new(url).await?);
    }

    println!(
        "Supplied URL is not a feed, but we found {} potential candidates. Please select one:",
        length
    );

    for (idx, url) in candidates.iter().enumerate() {
        println!("{}) {}", idx, url);
    }

    let stdin = io::stdin();
    loop {
        print!("select (0-{}, c to cancel): ", length - 1);
        // `print!` does not emit a newline, so flush to make the prompt visible.
        io::stdout().flush()?;

        let mut selection = String::new();
        let bytes_read = stdin.lock().read_line(&mut selection)?;
        if bytes_read == 0 {
            // End-of-file: no further input can ever arrive. Without this check the
            // empty line would fail to parse and the prompt would repeat forever.
            break Err(anyhow!("No selection was made."));
        }

        let selection = selection.trim();
        if selection == "c" {
            break Err(anyhow!("No selection was made."));
        }

        match selection.parse::<usize>() {
            Ok(select) if select < length => {
                // In range thanks to the guard above.
                let url = &candidates[select];

                // Scoped so the connection is dropped before the fetch below is awaited.
                let feed = {
                    let conn = state.db.get()?;
                    Feed::get_by_url(&conn, &url)?
                };

                if feed.is_some() {
                    println!("Error: Invalid selection: selected feed already exists");
                    continue;
                }

                // A candidate that fails to load is reported and the user is asked again.
                match RemoteFeed::new(url).await {
                    Ok(feed) => break Ok(feed),
                    Err(e) => println!("Error: Selection is not a feed: {}", e),
                }
            }
            Ok(_) => {
                println!("Error: Invalid selection: out of range");
            }
            Err(e) => {
                println!("Error: Invalid selection: {}", e);
            }
        }
    }
}

async fn add(state: State, url: String, group: Option<String>) -> Result<()> {
let feed = {
let conn = state.db.get()?;
Expand All @@ -61,7 +131,12 @@ impl FeedCommand {
return Err(anyhow!("Feed `{}` already exists!", url));
}

let remote = RemoteFeed::new(&url).await?;
let remote = match RemoteFeed::try_new(&url).await? {
Either::Left(remote) => remote,
Either::Right(candidates) => Self::select_remotes(&state, candidates).await?,
};

let url = remote.get_url().to_owned();

let feed = Feed::new(
remote
Expand Down
56 changes: 56 additions & 0 deletions src/find.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
//! Finds feed URLs on a web page.
use quick_xml::events::Event;
use quick_xml::Reader;
use std::io::BufRead;

use crate::error::Result;

/// Parses an HTML page to find `<link rel="alternate" />` elements and extracts their `href`s.
///
/// Every `link` element (open or self-closing) that has a `rel` attribute whose
/// value is exactly `alternate` contributes its first readable `href` attribute,
/// in document order. XML character references in the value (such as `&amp;`)
/// are unescaped; if unescaping fails the raw attribute value is used instead.
/// Values that are not valid UTF-8 are skipped.
///
/// The `type` attribute is not inspected and hrefs are returned as written in
/// the page, so callers get relative URLs and non-feed alternates unchanged.
///
/// # Errors
///
/// Returns the parser error together with the reader position when the
/// underlying XML reader fails.
pub fn find_rel_alternates<B: BufRead>(reader: B) -> Result<Vec<String>> {
    let mut reader = Reader::from_reader(reader);
    // Do not reject the document over mismatched closing tag names.
    reader.check_end_names(false);

    let mut buf = Vec::new();
    let mut result = Vec::new();

    loop {
        match reader.read_event(&mut buf) {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) if e.name() == b"link" => {
                // Malformed attributes are ignored rather than treated as fatal.
                let is_alternate = e.attributes().any(|attr| match attr {
                    Ok(attr) => attr.key == b"rel" && attr.value.as_ref() == b"alternate",
                    Err(_) => false,
                });

                if is_alternate {
                    let href = e.attributes().find_map(|attr| {
                        let attr = attr.ok()?;
                        if attr.key != b"href" {
                            return None;
                        }
                        // Attribute values are stored escaped; decode entities so the
                        // URL is usable, but keep the raw bytes if decoding fails.
                        let unescaped = attr.unescaped_value().map(|v| v.into_owned()).ok();
                        let bytes = unescaped.unwrap_or_else(|| attr.value.into_owned());
                        String::from_utf8(bytes).ok()
                    });

                    if let Some(url) = href {
                        result.push(url);
                    }
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err((e, reader.buffer_position()).into());
            }
            _ => (),
        }

        // Events borrow from `buf`; once handled, drop the bytes so the buffer
        // does not accumulate the whole document.
        buf.clear();
    }

    Ok(result)
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod error;
mod api;
mod cli;
mod crawler;
mod find;
pub mod model;
mod opml;
mod remote;
Expand Down
19 changes: 19 additions & 0 deletions src/remote.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use either::Either;

use crate::error::Result;
use crate::find::find_rel_alternates;

pub struct RemoteFeed {
url: String,
Expand All @@ -16,6 +19,18 @@ impl RemoteFeed {
})
}

/// Fetches `url` and tries to interpret the response body as a feed.
///
/// Returns `Either::Left` with the parsed `RemoteFeed` when the body parses as a
/// feed. Otherwise the parse error is discarded and the same body is scanned
/// with `find_rel_alternates`, whose links are returned as `Either::Right`.
///
/// # Errors
///
/// Fails when the request or reading the body fails, or when scanning the body
/// for alternate links fails.
pub async fn try_new(url: &str) -> Result<Either<Self, Vec<String>>> {
    let mut response = surf::get(url).await?;
    let body = response.body_bytes().await?;

    if let Ok(feed) = feed_rs::parser::parse(body.as_slice()) {
        let url = url.to_owned();
        return Ok(Either::Left(RemoteFeed { url, feed }));
    }

    let candidates = find_rel_alternates(body.as_slice())?;
    Ok(Either::Right(candidates))
}

/// Returns a copy of the feed title's text, or `None` when the feed has no title.
pub fn get_title(&self) -> Option<String> {
    let title = self.feed.title.as_ref()?;
    Some(title.content.clone())
}
Expand All @@ -29,4 +44,8 @@ impl RemoteFeed {
.next()
.map(|x| x.to_owned())
}

/// Returns the URL stored in this `RemoteFeed`, borrowed from `self`.
pub fn get_url(&self) -> &str {
    self.url.as_str()
}
}

0 comments on commit 53f78b9

Please sign in to comment.