[NewsDownloader] Added an HTML filter through a CSS selector (#6228)
Fixes #6185.
@@ -24,6 +24,54 @@ local max_redirects = 5; --prevent infinite redirects
 local TIMEOUT_CODE = "timeout" -- from socket.lua
 local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime
 
+-- filter HTML using CSS selector
+local function filter(text, element)
+    local htmlparser = require("htmlparser")
+    local root = htmlparser.parse(text, 5000)
+    local filtered = nil
+    local selectors = {
+        "main",
+        "article",
+        "div#main",
+        "#main-article",
+        ".main-content",
+        "#body",
+        "#content",
+        ".content",
+        "div#article",
+        "div.article",
+        "div.post",
+        "div.post-outer",
+        ".l-root",
+        ".content-container",
+        ".StandardArticleBody_body",
+        "div#article-inner",
+        "div#newsstorytext",
+        "div.general",
+    }
+    if element then
+        table.insert(selectors, 1, element)
+    end
+    for _, sel in ipairs(selectors) do
+        local elements = root:select(sel)
+        if elements then
+            for _, e in ipairs(elements) do
+                filtered = e:getcontent()
+                if filtered then
+                    break
+                end
+            end
+            if filtered then
+                break
+            end
+        end
+    end
+    if not filtered then
+        return text
+    end
+    return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
+end
+
 -- Sink that stores into a table, aborting if maxtime has elapsed
 local function sink_table_with_maxtime(t, maxtime)
     -- Start counting as soon as this sink is created
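For orientation, a minimal, hypothetical sketch of what the new file-local filter() helper does, assuming it could be called directly; the HTML snippet and selectors below are invented for illustration and are not part of the commit. A matching selector (the feed-specific one is tried first) keeps only that element's inner HTML, wrapped in a bare HTML skeleton; if nothing matches, the text is returned unchanged.

-- Illustration only (not part of the commit): exercising the logic of filter() above.
local html = [[<html><body>
<div class="sidebar">navigation</div>
<div class="post"><p>Article text.</p></div>
</body></html>]]

-- No feed-specific selector: the built-in list is scanned until "div.post"
-- matches, so only its inner HTML is kept.
local kept = filter(html)                         -- body holds <p>Article text.</p>

-- A feed-specific selector is inserted at position 1 and is tried first.
local sidebar_only = filter(html, "div.sidebar")  -- body holds "navigation"

-- Nothing in the list matches: the page is passed through unchanged.
local unchanged = filter("<p>plain</p>")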
@@ -181,15 +229,13 @@ local ext_to_mimetype = {
     ttf = "application/truetype",
     woff = "application/font-woff",
 }
 
 -- Create an epub file (with possibly images)
-function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message)
+function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
     logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
     -- Use Trapper to display progress and ask questions through the UI.
     -- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
     -- Trapper:info() and Trapper:confirm() will just use logger.
     local UI = require("ui/trapper")
 
     -- We may need to build absolute urls for non-absolute links and images urls
     local base_url = socket_url.parse(url)
 
@@ -201,7 +247,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
     -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
     -- should it changes if content is updated (as now, including the wikipedia revisionId),
     -- or should it stays the same even if revid changes (content of the same book updated).
 
+    if filter_enable then html = filter(html, filter_element) end
     local images = {}
     local seen_images = {}
     local imagenum = 1
 
@@ -21,12 +21,19 @@ return {--do NOT change this line
     -- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
     -- default value is 'false' (if no 'include_images' entry)
 
+    -- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that (does not apply if download_full_article=false)
+    -- 'enable_filter=false' - means no such filtering and including the full page
+    -- default value is 'false'
+
+    -- 'filter_element="name_of_css.element.class"' - means to filter the chosen CSS selector, it can be easily picked using a modern web browser
+    -- The default value is empty. The default list of common selectors is used as fallback if this value is set.
+
     -- comment out line ("--" at line start) to stop downloading source
 
 
     -- LIST YOUR FEEDS HERE:
 
-    { "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true},
+    { "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true, include_images=true, enable_filter=true},
 
     { "https://www.pcworld.com/index.rss", limit = 7 , download_full_article=false},
 
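As a concrete illustration of the new keys, a feed entry that targets a site-specific container could look like the sketch below; the URL and the "div.article-body" selector are invented for the example and are not among the shipped defaults.

    -- Hypothetical entry (not in the commit); the selector can be picked with a
    -- browser's "inspect element" and passed through filter_element:
    { "https://example.com/feed.xml", limit = 5, download_full_article=true, include_images=true, enable_filter=true, filter_element="div.article-body"},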
@@ -199,10 +199,12 @@ function NewsDownloader:loadConfigAndProcessFeeds()
         local limit = feed.limit
         local download_full_article = feed.download_full_article == nil or feed.download_full_article
         local include_images = not never_download_images and feed.include_images
+        local enable_filter = feed.enable_filter or feed.enable_filter == nil
+        local filter_element = feed.filter_element or feed.filter_element == nil
         if url and limit then
             local feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url))
             UI:info(feed_message)
-            NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message)
+            NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message, enable_filter, filter_element)
         else
             logger.warn('NewsDownloader: invalid feed config entry', feed)
         end
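A note on the two new locals above: with Lua's `or`, `feed.enable_filter or feed.enable_filter == nil` evaluates to true whenever the key is absent, so filtering is effectively on by default; the same pattern applied to filter_element makes an unset filter_element come out as the boolean true rather than nil or a selector string. A plain-Lua sketch of the evaluation, for illustration only:

-- Illustration only (no KOReader dependencies): how the defaults above evaluate.
local feed = { url = "https://example.com/feed.xml", limit = 2 }  -- no filter keys set
local enable_filter = feed.enable_filter or feed.enable_filter == nil
local filter_element = feed.filter_element or feed.filter_element == nil
print(enable_filter)   --> true   (filtering is on unless explicitly disabled)
print(filter_element)  --> true   (a boolean, not a selector string, when unset)

Whether a boolean filter_element is the intended default is not obvious from the diff; a nil or string default would be the more conventional shape for a value that is later used as a CSS selector.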
@@ -230,7 +232,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI()
     end)
 end
 
-function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message)
+function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)
 
     local ok, response = pcall(function()
         return DownloadBackend:getResponseAsString(url)
@@ -250,11 +252,11 @@ function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, do
 
     if is_atom then
         ok = pcall(function()
-            return self:processAtom(feeds, limit, download_full_article, include_images, message)
+            return self:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
         end)
     elseif is_rss then
         ok = pcall(function()
-            return self:processRSS(feeds, limit, download_full_article, include_images, message)
+            return self:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
         end)
     end
     if not ok or (not is_rss and not is_atom) then
@@ -280,7 +282,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
     return xmlhandler.root
 end
 
-function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message)
+function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
     local feed_output_dir = string.format("%s%s/",
         news_download_dir_path,
         util.getSafeFilename(getFeedTitle(feeds.feed.title)))
@@ -294,14 +296,14 @@ function NewsDownloader:processAtom(feeds, limit, download_full_article, include
         end
         local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
         if download_full_article then
-            self:downloadFeed(feed, feed_output_dir, include_images, article_message)
+            self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
         else
             self:createFromDescription(feed, feed.content[1], feed_output_dir, include_images, article_message)
         end
     end
 end
 
-function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message)
+function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
     local feed_output_dir = ("%s%s/"):format(
         news_download_dir_path, util.getSafeFilename(util.htmlEntitiesToUtf8(feeds.rss.channel.title)))
     if not lfs.attributes(feed_output_dir, "mode") then
@@ -314,7 +316,7 @@ function NewsDownloader:processRSS(feeds, limit, download_full_article, include_
         end
         local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
         if download_full_article then
-            self:downloadFeed(feed, feed_output_dir, include_images, article_message)
+            self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
         else
             self:createFromDescription(feed, feed.description, feed_output_dir, include_images, article_message)
         end
@@ -341,7 +343,7 @@ local function getTitleWithDate(feed)
     return title
 end
 
-function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message)
+function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element)
     local title_with_date = getTitleWithDate(feed)
     local news_file_path = ("%s%s%s"):format(feed_output_dir,
         title_with_date,
@@ -355,7 +357,7 @@ function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, mess
         local article_message = T(_("%1\n%2"), message, title_with_date)
         local link = getFeedLink(feed.link)
         local html = DownloadBackend:loadPage(link)
-        DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message)
+        DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
     end
 end