Skip to content

Commit 5674198

Browse files
committed
feat(enclosures): refactor enclosure handling to support multiple enclosures in articles internally
Signed-off-by: Gil Desmarais <git@desmarais.de>
1 parent b96d11d commit 5674198

File tree

10 files changed

+190
-82
lines changed

10 files changed

+190
-82
lines changed

lib/html2rss/html_extractor.rb

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def call
6161
description: extract_description,
6262
id: generate_id,
6363
published_at: extract_published_at,
64-
enclosure: extract_enclosure
64+
enclosures: extract_enclosures
6565
}
6666
end
6767

@@ -83,9 +83,7 @@ def find_main_anchor
8383
end
8484

8585
def extract_title
86-
return unless heading && (heading.children.empty? || heading.text)
87-
88-
self.class.extract_visible_text(heading)
86+
self.class.extract_visible_text(heading) if heading
8987
end
9088

9189
def heading
@@ -121,6 +119,6 @@ def generate_id
121119

122120
def extract_image = ImageExtractor.call(article_tag, base_url:)
123121
def extract_published_at = DateExtractor.call(article_tag)
124-
def extract_enclosure = EnclosureExtractor.call(article_tag, base_url).first
122+
def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
125123
end
126124
end

lib/html2rss/html_extractor/enclosure_extractor.rb

Lines changed: 88 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,95 @@
22

33
module Html2rss
44
class HtmlExtractor
5-
# Extracts video / audio content (to be used as enclosure) from an article_tag.
6-
# Extracts video and audio enclosures from an article tag.
7-
#
8-
# @param [Nokogiri::XML::Element] article_tag The HTML element containing the article.
9-
# @param [String] url The base URL to resolve relative URLs.
10-
# @return [Array<Hash>] Hash contains the enclosure url and type.
5+
##
6+
# Extracts enclosures from HTML tags using various strategies.
117
class EnclosureExtractor
12-
def self.call(article_tag, url)
13-
article_tag.css('video source[src], audio[src]').filter_map do |tag|
14-
src = tag['src'].to_s
15-
next if src.empty?
16-
17-
{
18-
url: Utils.build_absolute_url_from_relative(src, url),
19-
type: tag['type']
20-
}.compact
8+
##
# Runs every enclosure strategy against the article tag and merges the results.
#
# @param article_tag [#css] element handed to each strategy (the strategies query it via `css`)
# @param base_url [Object] forwarded unchanged to each strategy as the `base_url:` keyword
# @return [Array<Hash>] enclosure hashes in strategy order, without nil entries
def self.call(article_tag, base_url)
  strategies = [
    Extractors::Media,
    Extractors::Pdf,
    Extractors::Iframe,
    Extractors::Archive
  ]

  # `compact` drops nil placeholders a strategy may emit for elements it skipped,
  # so consumers can splat every entry into an enclosure safely.
  strategies.flat_map { |strategy| strategy.call(article_tag, base_url:) }.compact
end
16+
end
17+
18+
module Extractors
19+
# Extracts the article image as an enclosure.
# Delegates to ImageExtractor to find the image source and wraps it in the
# same list-of-hashes shape the other extraction strategies return.
class Image
  # @param article_tag [Object] forwarded to ImageExtractor
  # @param base_url [Object] forwarded to ImageExtractor as `base_url:`
  # @return [Array<Hash>] a single hash with :url and :type, or [] when no image is found
  def self.call(article_tag, base_url:)
    img_src = ImageExtractor.call(article_tag, base_url:)
    return [] unless img_src

    # Always answer with an Array so both branches have the same shape
    # (a bare Hash would be turned into key/value pairs by `Array()`).
    [
      {
        url: img_src,
        type: RssBuilder::Enclosure.guess_content_type_from_url(img_src, default: 'image/jpeg')
      }
    ]
  end
end
33+
34+
# Collects audio/video sources found below a tag as enclosure hashes.
class Media
  # @param tag [#css] element to search
  # @param base_url [Object] base handed to Utils.build_absolute_url_from_relative
  # @return [Array<Hash>] one hash (:url, :type) per element with a non-empty src;
  #   :type is the element's `type` attribute and may be nil
  def self.call(tag, base_url:)
    sources = tag.css('video source[src], audio source[src], audio[src]')

    sources.each_with_object([]) do |element, enclosures|
      src = element['src'].to_s
      next if src.empty?

      enclosures << {
        url: Utils.build_absolute_url_from_relative(src, base_url),
        type: element['type']
      }
    end
  end
end
48+
49+
# Extracts PDF enclosures from anchors whose href ends in ".pdf".
class Pdf
  # @param tag [#css] element to search
  # @param base_url [Object] base handed to Utils.build_absolute_url_from_relative
  # @return [Array<Hash>] one hash (:url, :type) per matching link; never contains nil
  def self.call(tag, base_url:)
    # filter_map (not map) so that a skipped element does not leave a nil in the result.
    tag.css('a[href$=".pdf"]').filter_map do |a|
      href = a['href'].to_s
      next if href.empty?

      abs_url = Utils.build_absolute_url_from_relative(href, base_url)
      {
        url: abs_url,
        type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url)
      }
    end
  end
end
64+
65+
# Extracts iframe enclosures from HTML tags.
class Iframe
  # @param tag [#css] element to search
  # @param base_url [Object] base handed to Utils.build_absolute_url_from_relative
  # @return [Array<Hash>] one hash (:url, :type) per iframe with a non-empty src
  def self.call(tag, base_url:)
    tag.css('iframe[src]').filter_map do |iframe|
      src = iframe['src'].to_s
      # `iframe[src]` also matches src="", which carries no embeddable URL;
      # skip it like the sibling strategies do for empty sources.
      next if src.empty?

      abs_url = Utils.build_absolute_url_from_relative(src, base_url)
      {
        url: abs_url,
        type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
      }
    end
  end
end
79+
80+
# Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
class Archive
  ZIP_TYPE = 'application/zip'
  GZIP_TYPE = 'application/gzip'

  # @param tag [#css] element to search
  # @param base_url [Object] base handed to Utils.build_absolute_url_from_relative
  # @return [Array<Hash>] one hash (:url, :type) per matching link; never contains nil
  def self.call(tag, base_url:)
    # filter_map (not map) so that a skipped element does not leave a nil in the result.
    tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |a|
      href = a['href'].to_s
      next if href.empty?

      {
        url: Utils.build_absolute_url_from_relative(href, base_url),
        type: content_type_for(href)
      }
    end
  end

  # Picks the media type from the link's suffix: the selector only admits
  # .zip, .tar.gz and .tgz, and the latter two are gzip streams, not zip files.
  #
  # @param href [String]
  # @return [String]
  def self.content_type_for(href)
    href.end_with?('.zip') ? ZIP_TYPE : GZIP_TYPE
  end
  private_class_method :content_type_for
end
2396
end

lib/html2rss/rendering/description_builder.rb

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,16 @@ module Html2rss
66
module Rendering
77
# Builds a sanitized article description from the base text, title, and optional media.
88
class DescriptionBuilder
9-
def initialize(base:, title:, url:, enclosure:, image:)
9+
def initialize(base:, title:, url:, enclosures:, image:)
1010
@base = base.to_s
1111
@title = title
1212
@url = url
13-
@enclosure = enclosure
13+
@enclosures = enclosures || []
1414
@image = image
1515
end
1616

1717
def call
18-
fragments = []
19-
fragments << media_renderer&.to_html
18+
fragments = Array(rendered_media)
2019
fragments << processed_base_description
2120

2221
result = fragments.compact.join("\n").strip
@@ -25,8 +24,10 @@ def call
2524

2625
private
2726

28-
def media_renderer
29-
MediaRenderer.for(enclosure: @enclosure, image: @image, title: @title)
27+
# Renders each enclosure to HTML through MediaRenderer, in order, leaving out
# enclosures for which no renderer or no markup is produced.
#
# NOTE(review): with an empty @enclosures list MediaRenderer.for is never
# called, so @image is not rendered from here — confirm image-only articles
# are covered elsewhere.
#
# @return [Array] the truthy `to_html` results
def rendered_media
  @enclosures.each_with_object([]) do |enclosure, html_fragments|
    renderer = MediaRenderer.for(enclosure:, image: @image, title: @title)
    html = renderer&.to_html
    html_fragments << html if html
  end
end
3132

3233
def processed_base_description

lib/html2rss/rss_builder/article.rb

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class Article
1313
include Enumerable
1414
include Comparable
1515

16-
PROVIDED_KEYS = %i[id title description url image author guid published_at enclosure categories scraper].freeze
16+
PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
1717

1818
##
1919
# Removes the specified pattern from the beginning of the text
@@ -78,7 +78,7 @@ def description
7878
base: @to_h[:description],
7979
title:,
8080
url:,
81-
enclosure:,
81+
enclosures:,
8282
image:
8383
).call
8484
end
@@ -102,11 +102,16 @@ def guid
102102
@guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
103103
end
104104

105+
# Builds the article's enclosures from the raw `:enclosures` entry of @to_h.
#
# Accepts nil, a single Hash or a list of Hashes; nil list entries are skipped.
# The result is memoized in @enclosures.
#
# @return [Array<Html2rss::RssBuilder::Enclosure>]
def enclosures
  @enclosures ||= begin
    raw = @to_h[:enclosures]
    # Array(hash) would explode a lone Hash into [key, value] pairs, so wrap it instead.
    entries = raw.is_a?(Hash) ? [raw] : Array(raw)
    # Drop nil entries before splatting; `**nil` is not accepted by every Ruby version.
    entries.compact.map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
  end
end
109+
105110
# @return [Html2rss::RssBuilder::Enclosure, nil]
106111
def enclosure
107112
return @enclosure if defined?(@enclosure)
108113

109-
case (object = @to_h[:enclosure])
114+
case (object = @to_h[:enclosures]&.first)
110115
when Hash
111116
@enclosure = Html2rss::RssBuilder::Enclosure.new(**object)
112117
when nil

lib/html2rss/selectors/post_processors/sanitize_html.rb

Lines changed: 49 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,49 @@ module PostProcessors
4040
# Would return:
4141
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
4242
class SanitizeHtml < Base
43+
# Attributes forced onto specific elements; handed to Sanitize as `add_attributes`.
#
# Every entry here is written onto the element and replaces an existing
# attribute of the same name, so only fixed policy values belong in this
# table. Attributes that must keep their original value (such as an iframe's
# src, width or height) must not be listed.
#
# @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
TAG_ATTRIBUTES = {
  'a' => {
    'rel' => 'nofollow noopener noreferrer',
    'target' => '_blank'
  },

  'area' => {
    'rel' => 'nofollow noopener noreferrer',
    'target' => '_blank'
  },

  'img' => {
    'referrerpolicy' => 'no-referrer',
    'crossorigin' => 'anonymous',
    'loading' => 'lazy',
    'decoding' => 'async'
  },

  'iframe' => {
    'referrerpolicy' => 'no-referrer',
    'crossorigin' => 'anonymous',
    'loading' => 'lazy',
    'sandbox' => 'allow-same-origin'
  },

  'video' => {
    'referrerpolicy' => 'no-referrer',
    'crossorigin' => 'anonymous',
    'preload' => 'none',
    'playsinline' => 'true',
    'controls' => 'true'
  },

  'audio' => {
    'referrerpolicy' => 'no-referrer',
    'crossorigin' => 'anonymous',
    'preload' => 'none'
  }
}.freeze
4386
def self.validate_args!(value, context)
4487
assert_type value, String, :value, context:
4588
end
@@ -50,7 +93,7 @@ def self.validate_args!(value, context)
5093
# @param url [String, Addressable::URI]
5194
# @return [String, nil]
5295
def self.get(html, url)
53-
return nil if html.to_s.empty?
96+
return nil if String(html).empty?
5497

5598
new(html, config: { channel: { url: } }).get
5699
end
@@ -70,30 +113,18 @@ def channel_url = context.dig(:config, :channel, :url)
70113

71114
##
72115
# @return [Sanitize::Config]
73-
def sanitize_config
74-
Sanitize::Config.merge(
116+
def sanitize_config # rubocop:disable Metrics/MethodLength
117+
config = Sanitize::Config.merge(
75118
Sanitize::Config::RELAXED,
76119
attributes: { all: %w[dir lang alt title translate] },
77-
add_attributes:,
120+
add_attributes: TAG_ATTRIBUTES,
78121
transformers: [
79122
method(:transform_urls_to_absolute_ones),
80123
method(:wrap_img_in_a)
81124
]
82125
)
83-
end
84-
85-
##
86-
# @return [Hash]
87-
# @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
88-
def add_attributes
89-
{
90-
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
91-
'area' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
92-
'img' => { 'referrerpolicy' => 'no-referrer' },
93-
'iframe' => { 'referrerpolicy' => 'no-referrer' },
94-
'video' => { 'referrerpolicy' => 'no-referrer' },
95-
'audio' => { 'referrerpolicy' => 'no-referrer' }
96-
}
126+
config[:elements].push('audio', 'video', 'source')
127+
config
97128
end
98129

99130
##

spec/lib/html2rss/auto_source/scraper/html_spec.rb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
description: 'Article 1 Headline Teaser for article 1. Read more',
5656
id: '/article1/',
5757
published_at: nil,
58-
enclosure: nil }
58+
enclosures: [] }
5959
end
6060
let(:second_article) do
6161
{ title: 'Article 2 Headline',
@@ -64,7 +64,7 @@
6464
description: 'Article 2 Headline Teaser for article 2. Read more',
6565
id: '/article2/',
6666
published_at: nil,
67-
enclosure: nil }
67+
enclosures: [] }
6868
end
6969

7070
it 'yields articles' do
@@ -100,7 +100,7 @@
100100
description: '[Plonk]',
101101
id: '/',
102102
published_at: nil,
103-
enclosure: nil }
103+
enclosures: [] }
104104
end
105105

106106
let(:second_article) do
@@ -111,7 +111,7 @@
111111
description: 'Bla bla bla',
112112
id: '/',
113113
published_at: nil,
114-
enclosure: nil
114+
enclosures: []
115115
}
116116
end
117117

spec/lib/html2rss/html_extractor_spec.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@
4545
published_at: an_instance_of(DateTime),
4646
url: Addressable::URI.parse('https://example.com/sample'),
4747
image: an_instance_of(Addressable::URI),
48-
enclosure: a_hash_including(
48+
enclosures: [a_hash_including(
4949
url: an_instance_of(Addressable::URI),
5050
type: 'video/mp4'
51-
)
51+
)]
5252
)
5353

5454
expect(article_hash[:published_at].to_s).to eq '2024-02-24T12:00:00-03:00'
@@ -92,7 +92,7 @@
9292
image: be_a(Addressable::URI),
9393
description: 'FCK PTN Sample description',
9494
id: nil,
95-
published_at: be_a(DateTime), enclosure: nil }
95+
published_at: be_a(DateTime), enclosures: [] }
9696
end
9797

9898
it 'returns the details' do

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy