mirror of
https://github.com/koreader/koreader.git
synced 2025-12-13 20:36:53 +01:00
[plugin] NewsDownloader: block CSS elements (#14383)
This commit is contained in:
@@ -13,6 +13,19 @@ local time = require("ui/time")
|
||||
local _ = require("gettext")
|
||||
local T = ffiutil.template
|
||||
|
||||
--- Remove every literal occurrence of `substr` from `str`.
-- Matching is plain text (no Lua pattern interpretation). After each removal
-- the search resumes at the position where the cut was made.
-- @tparam string str text to clean
-- @tparam string substr literal text to strip out of `str`
-- @treturn string `str` with the occurrences of `substr` removed
local function removeSubstring(str, substr)
    -- An empty needle matches at every position with a zero-length span
    -- (string.find returns iter, iter-1), so the loop below would never make
    -- progress. There is nothing to remove in that case.
    if substr == "" then
        return str
    end
    local iter = 1
    local i, j
    repeat
        -- 4th argument `true` turns off pattern matching: substr is literal.
        i, j = string.find(str, substr, iter, true)
        if i then
            -- Splice out the match and keep scanning from the cut position.
            str = string.sub(str, 1, i-1) .. string.sub(str, j+1, -1)
            iter = i
        end
    until not i
    return str
end
|
||||
|
||||
local EpubDownloadBackend = {
|
||||
-- Can be set so HTTP requests will be done under Trapper and
|
||||
-- be interruptible
|
||||
@@ -29,11 +42,27 @@ local FeedCache = CacheSQLite:new{
|
||||
size = 1024 * 1024 * 10, -- 10MB
|
||||
}
|
||||
|
||||
--- Collect the nodes matched by a list of CSS selectors, ordered by level.
-- Every selector in `elements` is run against `root` with `root:select()`;
-- all matches are gathered in one list, which is then sorted by ascending
-- `level` field (presumably the node's depth in the parsed tree - confirm
-- against the HTML parser in use).
-- @param root parsed node exposing a `select(selector)` method
-- @tparam table elements array of CSS selector strings
-- @treturn table array of matched nodes, lowest `level` first
local function selectSorted(root, elements)
    local collected = {}
    for _, selector in ipairs(elements) do
        -- A selector without a result set contributes nothing.
        for _, node in ipairs(root:select(selector) or {}) do
            collected[#collected + 1] = node
        end
    end
    local function byLevel(lhs, rhs)
        return lhs.level < rhs.level
    end
    table.sort(collected, byLevel)
    return collected
end
|
||||
|
||||
-- filter HTML using CSS selector
|
||||
local function filter(text, element)
|
||||
local function selectAndCleanHTML(text, filter_element, block_element)
|
||||
local htmlparser = require("htmlparser")
|
||||
local root = htmlparser.parse(text, 5000)
|
||||
local filtered = nil
|
||||
local filtered_e = nil
|
||||
local selectors = {
|
||||
"main",
|
||||
"article",
|
||||
@@ -54,33 +83,59 @@ local function filter(text, element)
|
||||
"div#newsstorytext",
|
||||
"div.general",
|
||||
}
|
||||
if type(element) == "string" and element ~= "" then
|
||||
table.insert(selectors, 1, element) -- Insert string at the beginning
|
||||
elseif type(element) == "table" then
|
||||
for _, el in ipairs(element) do
|
||||
local annoyances = {
|
||||
"div.article__social",
|
||||
"figure.is-type-video",
|
||||
}
|
||||
if type(filter_element) == "string" and filter_element ~= "" then
|
||||
table.insert(selectors, 1, filter_element) -- Insert string at the beginning
|
||||
elseif type(filter_element) == "table" then
|
||||
for _, el in ipairs(filter_element) do
|
||||
if type(el) == "string" and el ~= "" then
|
||||
table.insert(selectors, 1, el) -- Insert each non-empty element at the beginning
|
||||
end
|
||||
end
|
||||
end
|
||||
for _, sel in ipairs(selectors) do
|
||||
local elements = root:select(sel)
|
||||
if elements then
|
||||
for _, e in ipairs(elements) do
|
||||
filtered = e:getcontent()
|
||||
if filtered then
|
||||
local elements = root:select(sel)
|
||||
if elements then
|
||||
for _, e in ipairs(elements) do
|
||||
if e:getcontent() then
|
||||
filtered_e = e
|
||||
break
|
||||
end
|
||||
end
|
||||
if filtered then
|
||||
break
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
if filtered then
|
||||
break
|
||||
end
|
||||
end
|
||||
end
|
||||
if not filtered then
|
||||
return text
|
||||
if not filtered_e then
|
||||
filtered_e = root
|
||||
filtered = text
|
||||
else
|
||||
filtered = "<!DOCTYPE html><html><head></head><body>" .. filtered_e:getcontent() .. "</body></html>"
|
||||
end
|
||||
return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
|
||||
-- blocking
|
||||
if type(block_element) == "string" and block_element ~= "" then
|
||||
annoyances = { block_element }
|
||||
elseif type(block_element) == "table" then
|
||||
local custom_annoyances = {}
|
||||
for _, el in ipairs(block_element) do
|
||||
if type(el) == "string" and el ~= "" then
|
||||
table.insert(custom_annoyances, 1, el)
|
||||
end
|
||||
end
|
||||
if next(custom_annoyances) then -- there is at least one valid component
|
||||
annoyances = custom_annoyances
|
||||
end
|
||||
end
|
||||
-- Removing deeper elements may modify text inside others, making patterns not match.
|
||||
annoyances = selectSorted(filtered_e, annoyances)
|
||||
for _, e in ipairs(annoyances) do
|
||||
filtered = removeSubstring(filtered, e:gettext())
|
||||
end
|
||||
return filtered
|
||||
end
|
||||
|
||||
-- From https://github.com/lunarmodules/luasocket/blob/1fad1626900a128be724cba9e9c19a6b2fe2bf6b/samples/cookie.lua
|
||||
@@ -321,7 +376,7 @@ local ext_to_mimetype = {
|
||||
woff = "application/font-woff",
|
||||
}
|
||||
-- Create an epub file (with possibly images)
|
||||
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
|
||||
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element, block_element)
|
||||
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
|
||||
-- Use Trapper to display progress and ask questions through the UI.
|
||||
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
|
||||
@@ -343,7 +398,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
|
||||
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
|
||||
-- should it change if content is updated (as now, including the wikipedia revisionId),
|
||||
-- or should it stay the same even if revid changes (content of the same book updated).
|
||||
if filter_enable then html = filter(html, filter_element) end
|
||||
if filter_enable then html = selectAndCleanHTML(html, filter_element, block_element) end
|
||||
local images = {}
|
||||
local seen_images = {}
|
||||
local imagenum = 1
|
||||
|
||||
@@ -21,13 +21,16 @@ return {--do NOT change this line
|
||||
-- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
|
||||
-- default value is 'false' (if no 'include_images' entry)
|
||||
|
||||
-- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that (does not apply if download_full_article=false)
|
||||
-- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that and removes broken/annoying elements (does not apply if download_full_article=false)
|
||||
-- 'enable_filter=false' - means no such filtering and including the full page
|
||||
-- default value is 'false'
|
||||
|
||||
-- 'filter_element="name_of_css.element.class"' - means to filter the chosen CSS selector, it can be easily picked using a modern web browser
|
||||
-- The default value is empty. The default list of common selectors is used as fallback if this value is set.
|
||||
|
||||
-- 'block_element="name_of_css.element.class"' - means to remove the elements matching the chosen CSS selector, it can be easily picked using a modern web browser
|
||||
-- The default value is empty. The default list of common annoyances is used only when this value is empty; a custom value replaces that list.
|
||||
|
||||
-- Optional 'credentials' element is used to authenticate on subscription based articles.
|
||||
-- It is itself comprised of a 'url' strings, that is the url of the connexion form,
|
||||
-- and an 'auth' table that contains form data used for user authentication {form_key = value, …}.
|
||||
@@ -38,7 +41,7 @@ return {--do NOT change this line
|
||||
|
||||
-- LIST YOUR FEEDS HERE:
|
||||
|
||||
{ "https://github.com/koreader/koreader/releases.atom", limit = 3, download_full_article=false, include_images=false, enable_filter=true, filter_element = "div.release-main-section"},
|
||||
{ "https://ourworldindata.org/atom.xml", limit = 5 , download_full_article=true, include_images=true, enable_filter=false, filter_element = ""},
|
||||
{ "https://github.com/koreader/koreader/releases.atom", limit = 3, download_full_article=false, include_images=false, enable_filter=true, filter_element = "div.release-main-section", block_element = ""},
|
||||
{ "https://ourworldindata.org/atom.xml", limit = 5 , download_full_article=true, include_images=true, enable_filter=false, filter_element = "", block_element = ""},
|
||||
|
||||
}--do NOT change this line
|
||||
|
||||
@@ -8,6 +8,7 @@ local FeedView = {
|
||||
INCLUDE_IMAGES = "include_images",
|
||||
ENABLE_FILTER = "enable_filter",
|
||||
FILTER_ELEMENT = "filter_element",
|
||||
BLOCK_ELEMENT = "block_element",
|
||||
-- HTTP Basic Auth (optional)
|
||||
HTTP_AUTH_USERNAME = "http_auth_username",
|
||||
HTTP_AUTH_PASSWORD = "http_auth_password",
|
||||
@@ -70,6 +71,7 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
|
||||
local include_images = feed.include_images ~= false
|
||||
local enable_filter = feed.enable_filter ~= false
|
||||
local filter_element = feed.filter_element
|
||||
local block_element = feed.block_element
|
||||
local http_auth = feed.http_auth or { username = nil, password = nil }
|
||||
local http_auth_username = http_auth.username
|
||||
local http_auth_password_set = type(http_auth.password) == "string" and #http_auth.password > 0
|
||||
@@ -142,6 +144,17 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
|
||||
)
|
||||
end
|
||||
},
|
||||
{
|
||||
_("Block element"),
|
||||
block_element,
|
||||
callback = function()
|
||||
edit_feed_callback(
|
||||
id,
|
||||
FeedView.BLOCK_ELEMENT,
|
||||
block_element
|
||||
)
|
||||
end
|
||||
},
|
||||
--- HTTP Basic auth fields (optional)
|
||||
"---",
|
||||
{
|
||||
|
||||
@@ -61,6 +61,7 @@ local function getEmptyFeed()
|
||||
include_images = true,
|
||||
enable_filter = false,
|
||||
filter_element = "",
|
||||
block_element = "",
|
||||
http_auth = { username = nil, password = nil },
|
||||
}
|
||||
end
|
||||
@@ -301,6 +302,7 @@ function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance)
|
||||
local include_images = not never_download_images and feed.include_images
|
||||
local enable_filter = feed.enable_filter or feed.enable_filter == nil
|
||||
local filter_element = feed.filter_element or feed.filter_element == nil
|
||||
local block_element = feed.block_element or feed.block_element == nil
|
||||
local credentials = feed.credentials
|
||||
local http_auth = feed.http_auth
|
||||
-- Check if the two required attributes are set.
|
||||
@@ -318,7 +320,8 @@ function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance)
|
||||
include_images,
|
||||
feed_message,
|
||||
enable_filter,
|
||||
filter_element)
|
||||
filter_element,
|
||||
block_element)
|
||||
else
|
||||
logger.warn("NewsDownloader: invalid feed config entry.", feed)
|
||||
end
|
||||
@@ -390,7 +393,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI(touchmenu_instance)
|
||||
end)
|
||||
end
|
||||
|
||||
function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)
|
||||
function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element, block_element)
|
||||
-- Check if we have a cached response first
|
||||
local cache = DownloadBackend:getCache()
|
||||
local cached_response = cache:check(url)
|
||||
@@ -552,7 +555,8 @@ function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, un
|
||||
include_images,
|
||||
message,
|
||||
enable_filter,
|
||||
filter_element
|
||||
filter_element,
|
||||
block_element
|
||||
)
|
||||
end)
|
||||
elseif is_rss then
|
||||
@@ -567,7 +571,8 @@ function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, un
|
||||
include_images,
|
||||
message,
|
||||
enable_filter,
|
||||
filter_element
|
||||
filter_element,
|
||||
block_element
|
||||
)
|
||||
end)
|
||||
end
|
||||
@@ -612,7 +617,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
|
||||
return xmlhandler.root
|
||||
end
|
||||
|
||||
function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit, download_full_article, include_images, message, enable_filter, filter_element)
|
||||
function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit, download_full_article, include_images, message, enable_filter, filter_element, block_element)
|
||||
local feed_title
|
||||
local feed_item
|
||||
local total_items
|
||||
@@ -685,7 +690,8 @@ function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit,
|
||||
include_images,
|
||||
article_message,
|
||||
enable_filter,
|
||||
filter_element
|
||||
filter_element,
|
||||
block_element
|
||||
)
|
||||
else
|
||||
self:createFromDescription(
|
||||
@@ -724,7 +730,7 @@ local function getTitleWithDate(feed)
|
||||
return title
|
||||
end
|
||||
|
||||
function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir, include_images, message, enable_filter, filter_element)
|
||||
function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir, include_images, message, enable_filter, filter_element, block_element)
|
||||
local title_with_date = getTitleWithDate(feed)
|
||||
local news_file_path = ("%s%s%s"):format(feed_output_dir,
|
||||
title_with_date,
|
||||
@@ -742,7 +748,7 @@ function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir,
|
||||
extra_headers = { ["Authorization"] = "Basic " .. mime.b64((http_auth.username or "") .. ":" .. (http_auth.password or "")) }
|
||||
end
|
||||
local html = DownloadBackend:loadPage(link, cookies, extra_headers)
|
||||
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
|
||||
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element, block_element)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -929,6 +935,7 @@ function NewsDownloader:editFeedAttribute(id, key, value)
|
||||
if key == FeedView.URL
|
||||
or key == FeedView.LIMIT
|
||||
or key == FeedView.FILTER_ELEMENT
|
||||
or key == FeedView.BLOCK_ELEMENT
|
||||
or key == FeedView.HTTP_AUTH_USERNAME
|
||||
or key == FeedView.HTTP_AUTH_PASSWORD then
|
||||
|
||||
@@ -947,6 +954,10 @@ function NewsDownloader:editFeedAttribute(id, key, value)
|
||||
title = _("Edit filter element.")
|
||||
description = _("Filter based on the given CSS selector. E.g.: name_of_css.element.class")
|
||||
input_type = "string"
|
||||
elseif key == FeedView.BLOCK_ELEMENT then
|
||||
title = _("Edit block element.")
|
||||
description = _("Block element based on the given CSS selector. E.g.: name_of_css.element.class")
|
||||
input_type = "string"
|
||||
elseif key == FeedView.HTTP_AUTH_USERNAME then
|
||||
title = _("HTTP auth username")
|
||||
input_type = "string"
|
||||
@@ -1139,6 +1150,18 @@ function NewsDownloader:updateFeedConfig(id, key, value)
|
||||
}
|
||||
)
|
||||
end
|
||||
elseif key == FeedView.BLOCK_ELEMENT then
|
||||
if feed.block_element then
|
||||
feed.block_element = value
|
||||
else
|
||||
table.insert(
|
||||
feed,
|
||||
{
|
||||
"block_element",
|
||||
value
|
||||
}
|
||||
)
|
||||
end
|
||||
elseif key == FeedView.HTTP_AUTH_USERNAME then
|
||||
feed.http_auth = feed.http_auth or { username = "", password = "" }
|
||||
feed.http_auth.username = value or ""
|
||||
|
||||
Reference in New Issue
Block a user