[NewsDownloader] Added an HTML filter through a CSS selector (#6228)
Fixes #6185.
@@ -24,6 +24,54 @@ local max_redirects = 5; --prevent infinite redirects
 local TIMEOUT_CODE = "timeout" -- from socket.lua
 local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime
 
+-- filter HTML using CSS selector
+local function filter(text, element)
+    local htmlparser = require("htmlparser")
+    local root = htmlparser.parse(text, 5000)
+    local filtered = nil
+    local selectors = {
+        "main",
+        "article",
+        "div#main",
+        "#main-article",
+        ".main-content",
+        "#body",
+        "#content",
+        ".content",
+        "div#article",
+        "div.article",
+        "div.post",
+        "div.post-outer",
+        ".l-root",
+        ".content-container",
+        ".StandardArticleBody_body",
+        "div#article-inner",
+        "div#newsstorytext",
+        "div.general",
+    }
+    if element then
+        table.insert(selectors, 1, element)
+    end
+    for _, sel in ipairs(selectors) do
+        local elements = root:select(sel)
+        if elements then
+            for _, e in ipairs(elements) do
+                filtered = e:getcontent()
+                if filtered then
+                    break
+                end
+            end
+            if filtered then
+                break
+            end
+        end
+    end
+    if not filtered then
+        return text
+    end
+    return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
+end
+
 -- Sink that stores into a table, aborting if maxtime has elapsed
 local function sink_table_with_maxtime(t, maxtime)
     -- Start counting as soon as this sink is created
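For orientation, a minimal, hypothetical sketch of what the new file-local filter() helper does, assuming it could be called directly; the HTML snippet and selectors below are invented for illustration and are not part of the commit. A matching selector (the feed-specific one is tried first) keeps only that element's inner HTML, wrapped in a bare HTML skeleton; if nothing matches, the text is returned unchanged.

-- Illustration only (not part of the commit): exercising the logic of filter() above.
local html = [[<html><body>
<div class="sidebar">navigation</div>
<div class="post"><p>Article text.</p></div>
</body></html>]]

-- No feed-specific selector: the built-in list is scanned until "div.post"
-- matches, so only its inner HTML is kept.
local kept = filter(html)                         -- body holds <p>Article text.</p>

-- A feed-specific selector is inserted at position 1 and is tried first.
local sidebar_only = filter(html, "div.sidebar")  -- body holds "navigation"

-- Nothing in the list matches: the page is passed through unchanged.
local unchanged = filter("<p>plain</p>")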
@@ -181,15 +229,13 @@ local ext_to_mimetype = {
     ttf = "application/truetype",
     woff = "application/font-woff",
 }
 
 -- Create an epub file (with possibly images)
-function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message)
+function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
     logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
     -- Use Trapper to display progress and ask questions through the UI.
     -- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
     -- Trapper:info() and Trapper:confirm() will just use logger.
     local UI = require("ui/trapper")
 
     -- We may need to build absolute urls for non-absolute links and images urls
     local base_url = socket_url.parse(url)
 
@@ -201,7 +247,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
     -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
     -- should it changes if content is updated (as now, including the wikipedia revisionId),
     -- or should it stays the same even if revid changes (content of the same book updated).
 
+    if filter_enable then html = filter(html, filter_element) end
     local images = {}
     local seen_images = {}
     local imagenum = 1
 
@@ -21,12 +21,19 @@ return {--do NOT change this line
     -- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
     -- default value is 'false' (if no 'include_images' entry)
 
+    -- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that (does not apply if download_full_article=false)
+    -- 'enable_filter=false' - means no such filtering and including the full page
+    -- default value is 'false'
+
+    -- 'filter_element="name_of_css.element.class"' - means to filter the chosen CSS selector, it can be easily picked using a modern web browser
+    -- The default value is empty. The default list of common selectors is used as fallback if this value is set.
+
     -- comment out line ("--" at line start) to stop downloading source
 
 
     -- LIST YOUR FEEDS HERE:
 
-    { "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true},
+    { "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true, include_images=true, enable_filter=true},
 
     { "https://www.pcworld.com/index.rss", limit = 7 , download_full_article=false},
 
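As a concrete illustration of the new keys, a feed entry that targets a site-specific container could look like the sketch below; the URL and the "div.article-body" selector are invented for the example and are not among the shipped defaults.

    -- Hypothetical entry (not in the commit); the selector can be picked with a
    -- browser's "inspect element" and passed through filter_element:
    { "https://example.com/feed.xml", limit = 5, download_full_article=true, include_images=true, enable_filter=true, filter_element="div.article-body"},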
@@ -199,10 +199,12 @@ function NewsDownloader:loadConfigAndProcessFeeds()
         local limit = feed.limit
         local download_full_article = feed.download_full_article == nil or feed.download_full_article
         local include_images = not never_download_images and feed.include_images
+        local enable_filter = feed.enable_filter or feed.enable_filter == nil
+        local filter_element = feed.filter_element or feed.filter_element == nil
         if url and limit then
             local feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url))
             UI:info(feed_message)
-            NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message)
+            NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message, enable_filter, filter_element)
         else
             logger.warn('NewsDownloader: invalid feed config entry', feed)
         end
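A note on the two new locals above: with Lua's `or`, `feed.enable_filter or feed.enable_filter == nil` evaluates to true whenever the key is absent, so filtering is effectively on by default; the same pattern applied to filter_element makes an unset filter_element come out as the boolean true rather than nil or a selector string. A plain-Lua sketch of the evaluation, for illustration only:

-- Illustration only (no KOReader dependencies): how the defaults above evaluate.
local feed = { url = "https://example.com/feed.xml", limit = 2 }  -- no filter keys set
local enable_filter = feed.enable_filter or feed.enable_filter == nil
local filter_element = feed.filter_element or feed.filter_element == nil
print(enable_filter)   --> true   (filtering is on unless explicitly disabled)
print(filter_element)  --> true   (a boolean, not a selector string, when unset)

Whether a boolean filter_element is the intended default is not obvious from the diff; a nil or string default would be the more conventional shape for a value that is later used as a CSS selector.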
@@ -230,7 +232,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI()
     end)
 end
 
-function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message)
+function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)
 
     local ok, response = pcall(function()
         return DownloadBackend:getResponseAsString(url)
@@ -250,11 +252,11 @@ function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, do
 
     if is_atom then
         ok = pcall(function()
-            return self:processAtom(feeds, limit, download_full_article, include_images, message)
+            return self:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
         end)
     elseif is_rss then
         ok = pcall(function()
-            return self:processRSS(feeds, limit, download_full_article, include_images, message)
+            return self:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
         end)
     end
     if not ok or (not is_rss and not is_atom) then
@@ -280,7 +282,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
     return xmlhandler.root
 end
 
-function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message)
+function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
     local feed_output_dir = string.format("%s%s/",
         news_download_dir_path,
         util.getSafeFilename(getFeedTitle(feeds.feed.title)))
@@ -294,14 +296,14 @@ function NewsDownloader:processAtom(feeds, limit, download_full_article, include
         end
         local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
         if download_full_article then
-            self:downloadFeed(feed, feed_output_dir, include_images, article_message)
+            self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
         else
             self:createFromDescription(feed, feed.content[1], feed_output_dir, include_images, article_message)
         end
     end
 end
 
-function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message)
+function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
     local feed_output_dir = ("%s%s/"):format(
         news_download_dir_path, util.getSafeFilename(util.htmlEntitiesToUtf8(feeds.rss.channel.title)))
     if not lfs.attributes(feed_output_dir, "mode") then
@@ -314,7 +316,7 @@ function NewsDownloader:processRSS(feeds, limit, download_full_article, include_
         end
         local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
         if download_full_article then
-            self:downloadFeed(feed, feed_output_dir, include_images, article_message)
+            self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
         else
             self:createFromDescription(feed, feed.description, feed_output_dir, include_images, article_message)
         end
@@ -341,7 +343,7 @@ local function getTitleWithDate(feed)
     return title
 end
 
-function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message)
+function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element)
     local title_with_date = getTitleWithDate(feed)
     local news_file_path = ("%s%s%s"):format(feed_output_dir,
         title_with_date,
@@ -355,7 +357,7 @@ function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, mess
         local article_message = T(_("%1\n%2"), message, title_with_date)
         local link = getFeedLink(feed.link)
         local html = DownloadBackend:loadPage(link)
-        DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message)
+        DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
     end
 end