mirror of
https://github.com/koreader/koreader.git
synced 2025-12-13 20:36:53 +01:00
[plugin] NewsDownloader: block CSS elements (#14383)
This commit is contained in:
@@ -13,6 +13,19 @@ local time = require("ui/time")
|
|||||||
local _ = require("gettext")
|
local _ = require("gettext")
|
||||||
local T = ffiutil.template
|
local T = ffiutil.template
|
||||||
|
|
||||||
|
--- Remove every occurrence of a literal substring from a string.
-- The search is plain-text (no Lua pattern matching), runs left to right and
-- resumes right after each removed occurrence, so matches never overlap.
-- @string str text to clean
-- @string substr literal text to delete; nil or "" leaves str untouched
-- @treturn string str with all occurrences of substr removed
local function removeSubstring(str, substr)
    -- An empty needle matches at every position without consuming anything,
    -- so scanning for it would never advance and never terminate.
    if substr == nil or substr == "" then
        return str
    end

    local kept = {} -- pieces of str located between the removed occurrences
    local pos = 1
    while true do
        local i, j = string.find(str, substr, pos, true)
        if not i then
            break
        end
        kept[#kept + 1] = string.sub(str, pos, i - 1)
        pos = j + 1
    end

    if pos == 1 then
        return str -- nothing matched, hand back the input unchanged
    end
    kept[#kept + 1] = string.sub(str, pos)
    return table.concat(kept)
end
|
||||||
|
|
||||||
local EpubDownloadBackend = {
|
local EpubDownloadBackend = {
|
||||||
-- Can be set so HTTP requests will be done under Trapper and
|
-- Can be set so HTTP requests will be done under Trapper and
|
||||||
-- be interruptible
|
-- be interruptible
|
||||||
@@ -29,11 +42,27 @@ local FeedCache = CacheSQLite:new{
|
|||||||
size = 1024 * 1024 * 10, -- 10MB
|
size = 1024 * 1024 * 10, -- 10MB
|
||||||
}
|
}
|
||||||
|
|
||||||
|
-- Gather every node matched under `root` by any selector listed in `elements`
-- and return them in one list ordered by ascending `level` field
-- (presumably the node's depth in the tree, shallowest first -- confirm against htmlparser).
local function selectSorted(root, elements)
    local function lowerLevelFirst(lhs, rhs)
        return lhs.level < rhs.level
    end

    local matches = {}
    for _, selector in ipairs(elements) do
        -- A selector that yields nothing contributes no nodes.
        for _, node in ipairs(root:select(selector) or {}) do
            matches[#matches + 1] = node
        end
    end
    table.sort(matches, lowerLevelFirst)
    return matches
end
|
||||||
|
|
||||||
-- filter HTML using CSS selector
|
-- filter HTML using CSS selector
|
||||||
local function filter(text, element)
|
local function selectAndCleanHTML(text, filter_element, block_element)
|
||||||
local htmlparser = require("htmlparser")
|
local htmlparser = require("htmlparser")
|
||||||
local root = htmlparser.parse(text, 5000)
|
local root = htmlparser.parse(text, 5000)
|
||||||
local filtered = nil
|
local filtered = nil
|
||||||
|
local filtered_e = nil
|
||||||
local selectors = {
|
local selectors = {
|
||||||
"main",
|
"main",
|
||||||
"article",
|
"article",
|
||||||
@@ -54,33 +83,59 @@ local function filter(text, element)
|
|||||||
"div#newsstorytext",
|
"div#newsstorytext",
|
||||||
"div.general",
|
"div.general",
|
||||||
}
|
}
|
||||||
if type(element) == "string" and element ~= "" then
|
local annoyances = {
|
||||||
table.insert(selectors, 1, element) -- Insert string at the beginning
|
"div.article__social",
|
||||||
elseif type(element) == "table" then
|
"figure.is-type-video",
|
||||||
for _, el in ipairs(element) do
|
}
|
||||||
|
if type(filter_element) == "string" and filter_element ~= "" then
|
||||||
|
table.insert(selectors, 1, filter_element) -- Insert string at the beginning
|
||||||
|
elseif type(filter_element) == "table" then
|
||||||
|
for _, el in ipairs(filter_element) do
|
||||||
if type(el) == "string" and el ~= "" then
|
if type(el) == "string" and el ~= "" then
|
||||||
table.insert(selectors, 1, el) -- Insert each non-empty element at the beginning
|
table.insert(selectors, 1, el) -- Insert each non-empty element at the beginning
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
for _, sel in ipairs(selectors) do
|
for _, sel in ipairs(selectors) do
|
||||||
local elements = root:select(sel)
|
local elements = root:select(sel)
|
||||||
if elements then
|
if elements then
|
||||||
for _, e in ipairs(elements) do
|
for _, e in ipairs(elements) do
|
||||||
filtered = e:getcontent()
|
if e:getcontent() then
|
||||||
if filtered then
|
filtered_e = e
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
if filtered then
|
if filtered then
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
if not filtered then
|
if not filtered_e then
|
||||||
return text
|
filtered_e = root
|
||||||
|
filtered = text
|
||||||
|
else
|
||||||
|
filtered = "<!DOCTYPE html><html><head></head><body>" .. filtered_e:getcontent() .. "</body></html>"
|
||||||
end
|
end
|
||||||
return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
|
-- blocking
|
||||||
|
if type(block_element) == "string" and block_element ~= "" then
|
||||||
|
annoyances = { block_element }
|
||||||
|
elseif type(block_element) == "table" then
|
||||||
|
local custom_annoyances = {}
|
||||||
|
for _, el in ipairs(block_element) do
|
||||||
|
if type(el) == "string" and el ~= "" then
|
||||||
|
table.insert(custom_annoyances, 1, el)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if next(custom_annoyances) then -- there is at least one valid component
|
||||||
|
annoyances = custom_annoyances
|
||||||
|
end
|
||||||
|
end
|
||||||
|
-- Removing deeper elements may modify text inside others, making patterns not match.
|
||||||
|
annoyances = selectSorted(filtered_e, annoyances)
|
||||||
|
for _, e in ipairs(annoyances) do
|
||||||
|
filtered = removeSubstring(filtered, e:gettext())
|
||||||
|
end
|
||||||
|
return filtered
|
||||||
end
|
end
|
||||||
|
|
||||||
-- From https://github.com/lunarmodules/luasocket/blob/1fad1626900a128be724cba9e9c19a6b2fe2bf6b/samples/cookie.lua
|
-- From https://github.com/lunarmodules/luasocket/blob/1fad1626900a128be724cba9e9c19a6b2fe2bf6b/samples/cookie.lua
|
||||||
@@ -321,7 +376,7 @@ local ext_to_mimetype = {
|
|||||||
woff = "application/font-woff",
|
woff = "application/font-woff",
|
||||||
}
|
}
|
||||||
-- Create an epub file (with possibly images)
|
-- Create an epub file (with possibly images)
|
||||||
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
|
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element, block_element)
|
||||||
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
|
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
|
||||||
-- Use Trapper to display progress and ask questions through the UI.
|
-- Use Trapper to display progress and ask questions through the UI.
|
||||||
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
|
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
|
||||||
@@ -343,7 +398,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
|
|||||||
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
|
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
|
||||||
-- should it changes if content is updated (as now, including the wikipedia revisionId),
|
-- should it changes if content is updated (as now, including the wikipedia revisionId),
|
||||||
-- or should it stays the same even if revid changes (content of the same book updated).
|
-- or should it stays the same even if revid changes (content of the same book updated).
|
||||||
if filter_enable then html = filter(html, filter_element) end
|
if filter_enable then html = selectAndCleanHTML(html, filter_element, block_element) end
|
||||||
local images = {}
|
local images = {}
|
||||||
local seen_images = {}
|
local seen_images = {}
|
||||||
local imagenum = 1
|
local imagenum = 1
|
||||||
|
|||||||
@@ -21,13 +21,16 @@ return {--do NOT change this line
|
|||||||
-- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
|
-- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
|
||||||
-- default value is 'false' (if no 'include_images' entry)
|
-- default value is 'false' (if no 'include_images' entry)
|
||||||
|
|
||||||
-- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that (does not apply if download_full_article=false)
|
-- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that and removes broken/annoying elements (does not apply if download_full_article=false)
|
||||||
-- 'enable_filter=false' - means no such filtering and including the full page
|
-- 'enable_filter=false' - means no such filtering and including the full page
|
||||||
-- default value is 'false'
|
-- default value is 'false'
|
||||||
|
|
||||||
-- 'filter_element="name_of_css.element.class" - means to filter the chosen CSS selector, it can be easily picked using a modern web browser
|
-- 'filter_element="name_of_css.element.class" - means to filter the chosen CSS selector, it can be easily picked using a modern web browser
|
||||||
-- The default value is empty. The default list of common selectors is used as fallback if this value is set.
|
-- The default value is empty. The default list of common selectors is used as fallback if this value is set.
|
||||||
|
|
||||||
|
-- 'block_element="name_of_css.element.class"' - means to remove the elements matching the chosen CSS selector, it can be easily picked using a modern web browser
|
||||||
|
-- The default value is empty. The default list of common annoyances is used only if this value is not set; a custom value replaces that list.
|
||||||
|
|
||||||
-- Optional 'credentials' element is used to authenticate on subscription based articles.
|
-- Optional 'credentials' element is used to authenticate on subscription based articles.
|
||||||
-- It is itself comprised of a 'url' strings, that is the url of the connexion form,
|
-- It is itself comprised of a 'url' strings, that is the url of the connexion form,
|
||||||
-- and an 'auth' table that contains form data used for user authentication {form_key = value, …}.
|
-- and an 'auth' table that contains form data used for user authentication {form_key = value, …}.
|
||||||
@@ -38,7 +41,7 @@ return {--do NOT change this line
|
|||||||
|
|
||||||
-- LIST YOUR FEEDS HERE:
|
-- LIST YOUR FEEDS HERE:
|
||||||
|
|
||||||
{ "https://github.com/koreader/koreader/releases.atom", limit = 3, download_full_article=false, include_images=false, enable_filter=true, filter_element = "div.release-main-section"},
|
{ "https://github.com/koreader/koreader/releases.atom", limit = 3, download_full_article=false, include_images=false, enable_filter=true, filter_element = "div.release-main-section", block_element = ""},
|
||||||
{ "https://ourworldindata.org/atom.xml", limit = 5 , download_full_article=true, include_images=true, enable_filter=false, filter_element = ""},
|
{ "https://ourworldindata.org/atom.xml", limit = 5 , download_full_article=true, include_images=true, enable_filter=false, filter_element = "", block_element = ""},
|
||||||
|
|
||||||
}--do NOT change this line
|
}--do NOT change this line
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ local FeedView = {
|
|||||||
INCLUDE_IMAGES = "include_images",
|
INCLUDE_IMAGES = "include_images",
|
||||||
ENABLE_FILTER = "enable_filter",
|
ENABLE_FILTER = "enable_filter",
|
||||||
FILTER_ELEMENT = "filter_element",
|
FILTER_ELEMENT = "filter_element",
|
||||||
|
BLOCK_ELEMENT = "block_element",
|
||||||
-- HTTP Basic Auth (optional)
|
-- HTTP Basic Auth (optional)
|
||||||
HTTP_AUTH_USERNAME = "http_auth_username",
|
HTTP_AUTH_USERNAME = "http_auth_username",
|
||||||
HTTP_AUTH_PASSWORD = "http_auth_password",
|
HTTP_AUTH_PASSWORD = "http_auth_password",
|
||||||
@@ -70,6 +71,7 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
|
|||||||
local include_images = feed.include_images ~= false
|
local include_images = feed.include_images ~= false
|
||||||
local enable_filter = feed.enable_filter ~= false
|
local enable_filter = feed.enable_filter ~= false
|
||||||
local filter_element = feed.filter_element
|
local filter_element = feed.filter_element
|
||||||
|
local block_element = feed.block_element
|
||||||
local http_auth = feed.http_auth or { username = nil, password = nil }
|
local http_auth = feed.http_auth or { username = nil, password = nil }
|
||||||
local http_auth_username = http_auth.username
|
local http_auth_username = http_auth.username
|
||||||
local http_auth_password_set = type(http_auth.password) == "string" and #http_auth.password > 0
|
local http_auth_password_set = type(http_auth.password) == "string" and #http_auth.password > 0
|
||||||
@@ -142,6 +144,17 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
|
|||||||
)
|
)
|
||||||
end
|
end
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
_("Block element"),
|
||||||
|
block_element,
|
||||||
|
callback = function()
|
||||||
|
edit_feed_callback(
|
||||||
|
id,
|
||||||
|
FeedView.BLOCK_ELEMENT,
|
||||||
|
block_element
|
||||||
|
)
|
||||||
|
end
|
||||||
|
},
|
||||||
--- HTTP Basic auth fields (optional)
|
--- HTTP Basic auth fields (optional)
|
||||||
"---",
|
"---",
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -61,6 +61,7 @@ local function getEmptyFeed()
|
|||||||
include_images = true,
|
include_images = true,
|
||||||
enable_filter = false,
|
enable_filter = false,
|
||||||
filter_element = "",
|
filter_element = "",
|
||||||
|
block_element = "",
|
||||||
http_auth = { username = nil, password = nil },
|
http_auth = { username = nil, password = nil },
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
@@ -301,6 +302,7 @@ function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance)
|
|||||||
local include_images = not never_download_images and feed.include_images
|
local include_images = not never_download_images and feed.include_images
|
||||||
local enable_filter = feed.enable_filter or feed.enable_filter == nil
|
local enable_filter = feed.enable_filter or feed.enable_filter == nil
|
||||||
local filter_element = feed.filter_element or feed.filter_element == nil
|
local filter_element = feed.filter_element or feed.filter_element == nil
|
||||||
|
local block_element = feed.block_element or feed.block_element == nil
|
||||||
local credentials = feed.credentials
|
local credentials = feed.credentials
|
||||||
local http_auth = feed.http_auth
|
local http_auth = feed.http_auth
|
||||||
-- Check if the two required attributes are set.
|
-- Check if the two required attributes are set.
|
||||||
@@ -318,7 +320,8 @@ function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance)
|
|||||||
include_images,
|
include_images,
|
||||||
feed_message,
|
feed_message,
|
||||||
enable_filter,
|
enable_filter,
|
||||||
filter_element)
|
filter_element,
|
||||||
|
block_element)
|
||||||
else
|
else
|
||||||
logger.warn("NewsDownloader: invalid feed config entry.", feed)
|
logger.warn("NewsDownloader: invalid feed config entry.", feed)
|
||||||
end
|
end
|
||||||
@@ -390,7 +393,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI(touchmenu_instance)
|
|||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)
|
function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element, block_element)
|
||||||
-- Check if we have a cached response first
|
-- Check if we have a cached response first
|
||||||
local cache = DownloadBackend:getCache()
|
local cache = DownloadBackend:getCache()
|
||||||
local cached_response = cache:check(url)
|
local cached_response = cache:check(url)
|
||||||
@@ -552,7 +555,8 @@ function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, un
|
|||||||
include_images,
|
include_images,
|
||||||
message,
|
message,
|
||||||
enable_filter,
|
enable_filter,
|
||||||
filter_element
|
filter_element,
|
||||||
|
block_element
|
||||||
)
|
)
|
||||||
end)
|
end)
|
||||||
elseif is_rss then
|
elseif is_rss then
|
||||||
@@ -567,7 +571,8 @@ function NewsDownloader:processFeedSource(url, credentials, http_auth, limit, un
|
|||||||
include_images,
|
include_images,
|
||||||
message,
|
message,
|
||||||
enable_filter,
|
enable_filter,
|
||||||
filter_element
|
filter_element,
|
||||||
|
block_element
|
||||||
)
|
)
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
@@ -612,7 +617,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
|
|||||||
return xmlhandler.root
|
return xmlhandler.root
|
||||||
end
|
end
|
||||||
|
|
||||||
function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit, download_full_article, include_images, message, enable_filter, filter_element)
|
function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit, download_full_article, include_images, message, enable_filter, filter_element, block_element)
|
||||||
local feed_title
|
local feed_title
|
||||||
local feed_item
|
local feed_item
|
||||||
local total_items
|
local total_items
|
||||||
@@ -685,7 +690,8 @@ function NewsDownloader:processFeed(feed_type, feeds, cookies, http_auth, limit,
|
|||||||
include_images,
|
include_images,
|
||||||
article_message,
|
article_message,
|
||||||
enable_filter,
|
enable_filter,
|
||||||
filter_element
|
filter_element,
|
||||||
|
block_element
|
||||||
)
|
)
|
||||||
else
|
else
|
||||||
self:createFromDescription(
|
self:createFromDescription(
|
||||||
@@ -724,7 +730,7 @@ local function getTitleWithDate(feed)
|
|||||||
return title
|
return title
|
||||||
end
|
end
|
||||||
|
|
||||||
function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir, include_images, message, enable_filter, filter_element)
|
function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir, include_images, message, enable_filter, filter_element, block_element)
|
||||||
local title_with_date = getTitleWithDate(feed)
|
local title_with_date = getTitleWithDate(feed)
|
||||||
local news_file_path = ("%s%s%s"):format(feed_output_dir,
|
local news_file_path = ("%s%s%s"):format(feed_output_dir,
|
||||||
title_with_date,
|
title_with_date,
|
||||||
@@ -742,7 +748,7 @@ function NewsDownloader:downloadFeed(feed, cookies, http_auth, feed_output_dir,
|
|||||||
extra_headers = { ["Authorization"] = "Basic " .. mime.b64((http_auth.username or "") .. ":" .. (http_auth.password or "")) }
|
extra_headers = { ["Authorization"] = "Basic " .. mime.b64((http_auth.username or "") .. ":" .. (http_auth.password or "")) }
|
||||||
end
|
end
|
||||||
local html = DownloadBackend:loadPage(link, cookies, extra_headers)
|
local html = DownloadBackend:loadPage(link, cookies, extra_headers)
|
||||||
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
|
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element, block_element)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -929,6 +935,7 @@ function NewsDownloader:editFeedAttribute(id, key, value)
|
|||||||
if key == FeedView.URL
|
if key == FeedView.URL
|
||||||
or key == FeedView.LIMIT
|
or key == FeedView.LIMIT
|
||||||
or key == FeedView.FILTER_ELEMENT
|
or key == FeedView.FILTER_ELEMENT
|
||||||
|
or key == FeedView.BLOCK_ELEMENT
|
||||||
or key == FeedView.HTTP_AUTH_USERNAME
|
or key == FeedView.HTTP_AUTH_USERNAME
|
||||||
or key == FeedView.HTTP_AUTH_PASSWORD then
|
or key == FeedView.HTTP_AUTH_PASSWORD then
|
||||||
|
|
||||||
@@ -947,6 +954,10 @@ function NewsDownloader:editFeedAttribute(id, key, value)
|
|||||||
title = _("Edit filter element.")
|
title = _("Edit filter element.")
|
||||||
description = _("Filter based on the given CSS selector. E.g.: name_of_css.element.class")
|
description = _("Filter based on the given CSS selector. E.g.: name_of_css.element.class")
|
||||||
input_type = "string"
|
input_type = "string"
|
||||||
|
elseif key == FeedView.BLOCK_ELEMENT then
|
||||||
|
title = _("Edit block element.")
|
||||||
|
description = _("Block element based on the given CSS selector. E.g.: name_of_css.element.class")
|
||||||
|
input_type = "string"
|
||||||
elseif key == FeedView.HTTP_AUTH_USERNAME then
|
elseif key == FeedView.HTTP_AUTH_USERNAME then
|
||||||
title = _("HTTP auth username")
|
title = _("HTTP auth username")
|
||||||
input_type = "string"
|
input_type = "string"
|
||||||
@@ -1139,6 +1150,18 @@ function NewsDownloader:updateFeedConfig(id, key, value)
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
elseif key == FeedView.BLOCK_ELEMENT then
|
||||||
|
if feed.block_element then
|
||||||
|
feed.block_element = value
|
||||||
|
else
|
||||||
|
table.insert(
|
||||||
|
feed,
|
||||||
|
{
|
||||||
|
"block_element",
|
||||||
|
value
|
||||||
|
}
|
||||||
|
)
|
||||||
|
end
|
||||||
elseif key == FeedView.HTTP_AUTH_USERNAME then
|
elseif key == FeedView.HTTP_AUTH_USERNAME then
|
||||||
feed.http_auth = feed.http_auth or { username = "", password = "" }
|
feed.http_auth = feed.http_auth or { username = "", password = "" }
|
||||||
feed.http_auth.username = value or ""
|
feed.http_auth.username = value or ""
|
||||||
|
|||||||
Reference in New Issue
Block a user