[plugin] NewsDownloader: block CSS elements (#14383)

This commit is contained in:
nuxa17
2025-09-30 22:07:35 +02:00
committed by GitHub
parent 87a5116a24
commit 2ddbe04715
4 changed files with 126 additions and 32 deletions

View File

@@ -13,6 +13,19 @@ local time = require("ui/time")
local _ = require("gettext")
local T = ffiutil.template
--- Remove every occurrence of a literal substring from a string.
-- Matching is plain (Lua pattern magic characters have no special meaning).
-- Occurrences are found left to right without overlap; a new occurrence that
-- only appears once the text on both sides of a removal is joined is NOT removed.
-- @tparam string str text to clean
-- @tparam string substr literal text to remove; nil or "" leaves str untouched
-- @treturn string str without the occurrences of substr
local function removeSubstring(str, substr)
    -- An empty needle matches at every position without consuming any text,
    -- so scanning for it would never advance. There is nothing to remove anyway.
    if substr == nil or substr == "" then
        return str
    end
    -- Collect the pieces between matches and join them once, instead of
    -- rebuilding the whole string after every hit.
    local pieces = {}
    local pos = 1
    while true do
        local i, j = string.find(str, substr, pos, true)
        if not i then
            break
        end
        pieces[#pieces + 1] = string.sub(str, pos, i - 1)
        pos = j + 1 -- resume right after the removed occurrence
    end
    if pos == 1 then
        return str -- no occurrence found, hand back the input unchanged
    end
    pieces[#pieces + 1] = string.sub(str, pos)
    return table.concat(pieces)
end
local EpubDownloadBackend = {
-- Can be set so HTTP requests will be done under Trapper and
-- be interruptible
@@ -29,11 +42,27 @@ local FeedCache = CacheSQLite:new{
size = 1024 * 1024 * 10, -- 10MB
}
-- Collect the HTML elements matched by a list of CSS selectors, ordered by level.
-- `root` must provide a `select(selector)` method returning a list of elements
-- (or nil); each element must carry a numeric `level` field.
-- Elements are returned in ascending `level` order. An element matched by
-- several selectors appears once per match.
local function selectSorted(root, elements)
    local function byLevel(lhs, rhs)
        return lhs.level < rhs.level
    end

    local collected = {}
    for _, selector in ipairs(elements) do
        local matches = root:select(selector)
        -- a selector without any result may yield nil: just skip it
        for _, node in ipairs(matches or {}) do
            collected[#collected + 1] = node
        end
    end
    table.sort(collected, byLevel)
    return collected
end
-- filter HTML using CSS selector
local function filter(text, element)
local function selectAndCleanHTML(text, filter_element, block_element)
local htmlparser = require("htmlparser")
local root = htmlparser.parse(text, 5000)
local filtered = nil
local filtered_e = nil
local selectors = {
"main",
"article",
@@ -54,33 +83,59 @@ local function filter(text, element)
"div#newsstorytext",
"div.general",
}
if type(element) == "string" and element ~= "" then
table.insert(selectors, 1, element) -- Insert string at the beginning
elseif type(element) == "table" then
for _, el in ipairs(element) do
local annoyances = {
"div.article__social",
"figure.is-type-video",
}
if type(filter_element) == "string" and filter_element ~= "" then
table.insert(selectors, 1, filter_element) -- Insert string at the beginning
elseif type(filter_element) == "table" then
for _, el in ipairs(filter_element) do
if type(el) == "string" and el ~= "" then
table.insert(selectors, 1, el) -- Insert each non-empty element at the beginning
end
end
end
for _, sel in ipairs(selectors) do
local elements = root:select(sel)
if elements then
for _, e in ipairs(elements) do
filtered = e:getcontent()
if filtered then
local elements = root:select(sel)
if elements then
for _, e in ipairs(elements) do
if e:getcontent() then
filtered_e = e
break
end
end
if filtered then
break
end
end
end
end
if filtered then
break
end
end
end
if not filtered then
return text
if not filtered_e then
filtered_e = root
filtered = text
else
filtered = "<!DOCTYPE html><html><head></head><body>" .. filtered_e:getcontent() .. "</body></html>"
end
return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
-- blocking
if type(block_element) == "string" and block_element ~= "" then
annoyances = { block_element }
elseif type(block_element) == "table" then
local custom_annoyances = {}
for _, el in ipairs(block_element) do
if type(el) == "string" and el ~= "" then
table.insert(custom_annoyances, 1, el)
end
end
if next(custom_annoyances) then -- there is at least one valid component
annoyances = custom_annoyances
end
end
-- Removing deeper elements may modify text inside others, making patterns not match.
annoyances = selectSorted(filtered_e, annoyances)
for _, e in ipairs(annoyances) do
filtered = removeSubstring(filtered, e:gettext())
end
return filtered
end
-- From https://github.com/lunarmodules/luasocket/blob/1fad1626900a128be724cba9e9c19a6b2fe2bf6b/samples/cookie.lua
@@ -321,7 +376,7 @@ local ext_to_mimetype = {
woff = "application/font-woff",
}
-- Create an epub file (with possibly images)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element, block_element)
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
-- Use Trapper to display progress and ask questions through the UI.
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
@@ -343,7 +398,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
-- should it changes if content is updated (as now, including the wikipedia revisionId),
-- or should it stays the same even if revid changes (content of the same book updated).
if filter_enable then html = filter(html, filter_element) end
if filter_enable then html = selectAndCleanHTML(html, filter_element, block_element) end
local images = {}
local seen_images = {}
local imagenum = 1

View File

@@ -21,13 +21,16 @@ return {--do NOT change this line
-- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
-- default value is 'false' (if no 'include_images' entry)
-- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that (does not apply if download_full_article=false)
-- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that and removes broken/annoying elements (does not apply if download_full_article=false)
-- 'enable_filter=false' - means no such filtering and including the full page
-- default value is 'false'
-- 'filter_element="name_of_css.element.class" - means to filter the chosen CSS selector, it can be easily picked using a modern web browser
-- The default value is empty. The default list of common selectors is used as fallback if this value is set.
-- 'block_element="name_of_css.element.class"' - means to remove the elements matching the chosen CSS selector, it can be easily picked using a modern web browser
-- The default value is empty, in which case the default list of common annoyances is removed. Setting this value replaces that default list.
-- Optional 'credentials' element is used to authenticate on subscription based articles.
-- It is itself comprised of a 'url' string, which is the URL of the connection (login) form,
-- and an 'auth' table that contains form data used for user authentication {form_key = value, …}.
@@ -38,7 +41,7 @@ return {--do NOT change this line
-- LIST YOUR FEEDS HERE:
{ "https://github.com/koreader/koreader/releases.atom", limit = 3, download_full_article=false, include_images=false, enable_filter=true, filter_element = "div.release-main-section"},
{ "https://ourworldindata.org/atom.xml", limit = 5 , download_full_article=true, include_images=true, enable_filter=false, filter_element = ""},
{ "https://github.com/koreader/koreader/releases.atom", limit = 3, download_full_article=false, include_images=false, enable_filter=true, filter_element = "div.release-main-section", block_element = ""},
{ "https://ourworldindata.org/atom.xml", limit = 5 , download_full_article=true, include_images=true, enable_filter=false, filter_element = "", block_element = ""},
}--do NOT change this line

View File

@@ -8,6 +8,7 @@ local FeedView = {
INCLUDE_IMAGES = "include_images",
ENABLE_FILTER = "enable_filter",
FILTER_ELEMENT = "filter_element",
BLOCK_ELEMENT = "block_element",
-- HTTP Basic Auth (optional)
HTTP_AUTH_USERNAME = "http_auth_username",
HTTP_AUTH_PASSWORD = "http_auth_password",
@@ -70,6 +71,7 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
local include_images = feed.include_images ~= false
local enable_filter = feed.enable_filter ~= false
local filter_element = feed.filter_element
local block_element = feed.block_element
local http_auth = feed.http_auth or { username = nil, password = nil }
local http_auth_username = http_auth.username
local http_auth_password_set = type(http_auth.password) == "string" and #http_auth.password > 0
@@ -142,6 +144,17 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
)
end
},
{
_("Block element"),
block_element,
callback = function()
edit_feed_callback(
id,
FeedView.BLOCK_ELEMENT,
block_element
)
end
},
--- HTTP Basic auth fields (optional)
"---",
{

View File

@@ -61,6 +61,7 @@ local function getEmptyFeed()
include_images = true,
enable_filter = false,
filter_element = "",
block_element = "",
http_auth = { username = nil, password = nil },
}
end
@@ -301,6 +302,7 @@ function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance)
local include_images = not never_download_images and feed.include_images
local enable_filter = feed.enable_filter or feed.enable_filter == nil
local filter_element = feed.filter_element or feed.filter_element == nil
local block_element = feed.block_element or feed.block_element == nil
local credentials = feed.credentials
local http_auth = feed.http_auth
-- Check if the two required attributes are set.
@@ -318,7 +320,8 @@ function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance)
include_images,
feed_message,
enable_filter,
filter_element)
filter_element,
block_element)
else
logger.warn("NewsDownloader: invalid feed config entry.", feed)
end
@@ -390,7 +393,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI(touchmenu_instance)
end)
end
function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)
function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element, block_element)
-- Check if we have a cached response first
local cache = DownloadBackend:getCache()
local cached_response = cache:check(url)
@@ -552,7 +555,8 @@ function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, un
include_images,
message,
enable_filter,
filter_element
filter_element,
block_element
)
end)
elseif is_rss then
@@ -567,7 +571,8 @@ function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, un
include_images,
message,
enable_filter,
filter_element
filter_element,
block_element
)
end)
end
@@ -612,7 +617,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
return xmlhandler.root
end
function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit, download_full_article, include_images, message, enable_filter, filter_element)
function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit, download_full_article, include_images, message, enable_filter, filter_element, block_element)
local feed_title
local feed_item
local total_items
@@ -685,7 +690,8 @@ function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit,
include_images,
article_message,
enable_filter,
filter_element
filter_element,
block_element
)
else
self:createFromDescription(
@@ -724,7 +730,7 @@ local function getTitleWithDate(feed)
return title
end
function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir, include_images, message, enable_filter, filter_element)
function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir, include_images, message, enable_filter, filter_element, block_element)
local title_with_date = getTitleWithDate(feed)
local news_file_path = ("%s%s%s"):format(feed_output_dir,
title_with_date,
@@ -742,7 +748,7 @@ function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir,
extra_headers = { ["Authorization"] = "Basic " .. mime.b64((http_auth.username or "") .. ":" .. (http_auth.password or "")) }
end
local html = DownloadBackend:loadPage(link, cookies, extra_headers)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element, block_element)
end
end
@@ -929,6 +935,7 @@ function NewsDownloader:editFeedAttribute(id, key, value)
if key == FeedView.URL
or key == FeedView.LIMIT
or key == FeedView.FILTER_ELEMENT
or key == FeedView.BLOCK_ELEMENT
or key == FeedView.HTTP_AUTH_USERNAME
or key == FeedView.HTTP_AUTH_PASSWORD then
@@ -947,6 +954,10 @@ function NewsDownloader:editFeedAttribute(id, key, value)
title = _("Edit filter element.")
description = _("Filter based on the given CSS selector. E.g.: name_of_css.element.class")
input_type = "string"
elseif key == FeedView.BLOCK_ELEMENT then
title = _("Edit block element.")
description = _("Block element based on the given CSS selector. E.g.: name_of_css.element.class")
input_type = "string"
elseif key == FeedView.HTTP_AUTH_USERNAME then
title = _("HTTP auth username")
input_type = "string"
@@ -1139,6 +1150,18 @@ function NewsDownloader:updateFeedConfig(id, key, value)
}
)
end
elseif key == FeedView.BLOCK_ELEMENT then
if feed.block_element then
feed.block_element = value
else
table.insert(
feed,
{
"block_element",
value
}
)
end
elseif key == FeedView.HTTP_AUTH_USERNAME then
feed.http_auth = feed.http_auth or { username = "", password = "" }
feed.http_auth.username = value or ""