Skip to content

Commit abb13fc

Browse files
committed
feat(enclosures): refactor enclosure handling to support multiple enclosures in articles internally
1 parent b96d11d commit abb13fc

File tree

8 files changed

+127
-50
lines changed

8 files changed

+127
-50
lines changed

lib/html2rss/html_extractor.rb

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def call
6161
description: extract_description,
6262
id: generate_id,
6363
published_at: extract_published_at,
64-
enclosure: extract_enclosure
64+
enclosures: extract_enclosures
6565
}
6666
end
6767

@@ -83,9 +83,7 @@ def find_main_anchor
8383
end
8484

8585
def extract_title
86-
return unless heading && (heading.children.empty? || heading.text)
87-
88-
self.class.extract_visible_text(heading)
86+
self.class.extract_visible_text(heading) if heading
8987
end
9088

9189
def heading
@@ -121,6 +119,6 @@ def generate_id
121119

122120
def extract_image = ImageExtractor.call(article_tag, base_url:)
123121
def extract_published_at = DateExtractor.call(article_tag)
124-
def extract_enclosure = EnclosureExtractor.call(article_tag, base_url).first
122+
def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
125123
end
126124
end

lib/html2rss/html_extractor/enclosure_extractor.rb

Lines changed: 88 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,95 @@
22

33
module Html2rss
44
class HtmlExtractor
5-
# Extracts video / audio content (to be used as enclosure) from an article_tag.
6-
# Extracts video and audio enclosures from an article tag.
7-
#
8-
# @param [Nokogiri::XML::Element] article_tag The HTML element containing the article.
9-
# @param [String] url The base URL to resolve relative URLs.
10-
# @return [Array<Hash>] Hash contains the enclosure url and type.
5+
##
6+
# Extracts enclosures from HTML tags using various strategies.
117
class EnclosureExtractor
12-
def self.call(article_tag, url)
13-
article_tag.css('video source[src], audio[src]').filter_map do |tag|
14-
src = tag['src'].to_s
15-
next if src.empty?
16-
17-
{
18-
url: Utils.build_absolute_url_from_relative(src, url),
19-
type: tag['type']
20-
}.compact
8+
# Runs every extraction strategy against the article tag and returns their
# results as one flat list, in strategy order.
#
# @param article_tag [Nokogiri::XML::Element] the HTML element containing the article
# @param base_url [String] the base URL handed to each strategy
# @return [Array<Hash>] the enclosure hashes produced by all strategies
def self.call(article_tag, base_url)
  strategies = [Extractors::Media, Extractors::Pdf, Extractors::Iframe, Extractors::Archive]

  strategies.map { |extractor| extractor.call(article_tag, base_url:) }.flatten(1)
end
16+
end
17+
18+
module Extractors
19+
# Extracts image enclosures from HTML tags.
# Uses the ImageExtractor to find the image source and returns it in a format suitable for RSS.
class Image
  # @param article_tag [Nokogiri::XML::Element] element forwarded to ImageExtractor
  # @param base_url [String] base URL forwarded to ImageExtractor
  # @return [Array<Hash>] a one-element array holding the image enclosure
  #   (+:url+ and +:type+), or an empty array when no image source was found
  def self.call(article_tag, base_url:)
    img_src = ImageExtractor.call(article_tag, base_url:)
    return [] unless img_src

    # Always answer with an Array so the return type does not depend on
    # whether an image was found; callers that flat_map over strategy
    # results see the same flattened output either way.
    [
      {
        url: img_src,
        type: RssBuilder::Enclosure.guess_content_type_from_url(img_src, default: 'image/jpeg')
      }
    ]
  end
end
33+
34+
# Extracts media enclosures (video/audio) from HTML tags.
class Media
  # @param tag [Nokogiri::XML::Element] element whose descendants are searched
  # @param base_url [String] base URL used to resolve relative +src+ values
  # @return [Array<Hash>] one hash per matching element with +:url+ and,
  #   when the element declares a +type+ attribute, +:type+
  def self.call(tag, base_url:)
    tag.css('video source[src], audio[src]').filter_map do |element|
      src = element['src'].to_s
      next if src.empty?

      # compact omits :type when the element has no type attribute, as the
      # pre-refactoring extractor did, instead of emitting an explicit nil.
      {
        url: Utils.build_absolute_url_from_relative(src, base_url),
        type: element['type']
      }.compact
    end
  end
end
48+
49+
# Extracts PDF enclosures from HTML tags.
class Pdf
  # @param tag [Nokogiri::XML::Element] element whose descendants are searched
  # @param base_url [String] base URL used to resolve relative +href+ values
  # @return [Array<Hash>] one hash (+:url+, +:type+) per anchor whose href ends in ".pdf"
  def self.call(tag, base_url:)
    # filter_map (not map) so that a skipped anchor never leaves a nil
    # entry in the result.
    tag.css('a[href$=".pdf"]').filter_map do |a|
      href = a['href'].to_s
      next if href.empty?

      abs_url = Utils.build_absolute_url_from_relative(href, base_url)
      {
        url: abs_url,
        type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url)
      }
    end
  end
end
64+
65+
# Extracts iframe enclosures from HTML tags.
class Iframe
  # @param tag [Nokogiri::XML::Element] element whose descendants are searched
  # @param base_url [String] base URL used to resolve relative +src+ values
  # @return [Array<Hash>] one hash (+:url+, +:type+) per iframe with a non-empty src
  def self.call(tag, base_url:)
    tag.css('iframe[src]').filter_map do |iframe|
      # The selector also matches src="" — skip those rather than resolving
      # an empty path against the base URL.
      src = iframe['src'].to_s
      next if src.empty?

      abs_url = Utils.build_absolute_url_from_relative(src, base_url)
      {
        url: abs_url,
        type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
      }
    end
  end
end
79+
80+
# Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
class Archive
  # Supported file extensions and the MIME type reported for each.
  MIME_TYPES = {
    '.zip' => 'application/zip',
    '.tar.gz' => 'application/gzip',
    '.tgz' => 'application/gzip'
  }.freeze

  # Type used when an href matches none of the known extensions.
  DEFAULT_MIME_TYPE = 'application/zip'

  # Built from MIME_TYPES so the selector and the type lookup stay in sync.
  SELECTOR = MIME_TYPES.keys.map { |ext| %(a[href$="#{ext}"]) }.join(', ').freeze

  # @param tag [Nokogiri::XML::Element] element whose descendants are searched
  # @param base_url [String] base URL used to resolve relative +href+ values
  # @return [Array<Hash>] one hash (+:url+, +:type+) per matching archive link
  def self.call(tag, base_url:)
    # filter_map (not map) so that a skipped anchor never leaves a nil
    # entry in the result.
    tag.css(SELECTOR).filter_map do |a|
      href = a['href'].to_s
      next if href.empty?

      {
        url: Utils.build_absolute_url_from_relative(href, base_url),
        type: mime_type_for(href)
      }
    end
  end

  # Picks the MIME type matching the href's file extension.
  #
  # @param href [String] the raw href attribute value
  # @return [String] the MIME type, or DEFAULT_MIME_TYPE when no extension matches
  def self.mime_type_for(href)
    _ext, type = MIME_TYPES.find { |ext, _| href.end_with?(ext) }
    type || DEFAULT_MIME_TYPE
  end
  private_class_method :mime_type_for
end
2396
end

lib/html2rss/rendering/description_builder.rb

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,16 @@ module Html2rss
66
module Rendering
77
# Builds a sanitized article description from the base text, title, and optional media.
88
class DescriptionBuilder
9-
def initialize(base:, title:, url:, enclosure:, image:)
9+
def initialize(base:, title:, url:, enclosures:, image:)
1010
@base = base.to_s
1111
@title = title
1212
@url = url
13-
@enclosure = enclosure
13+
@enclosures = enclosures || []
1414
@image = image
1515
end
1616

1717
def call
18-
fragments = []
19-
fragments << media_renderer&.to_html
18+
fragments = Array(rendered_media)
2019
fragments << processed_base_description
2120

2221
result = fragments.compact.join("\n").strip
@@ -25,8 +24,10 @@ def call
2524

2625
private
2726

28-
def media_renderer
29-
MediaRenderer.for(enclosure: @enclosure, image: @image, title: @title)
27+
def rendered_media
28+
@enclosures.filter_map do |enclosure|
29+
MediaRenderer.for(enclosure:, image: @image, title: @title)&.to_html
30+
end
3031
end
3132

3233
def processed_base_description

lib/html2rss/rss_builder/article.rb

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class Article
1313
include Enumerable
1414
include Comparable
1515

16-
PROVIDED_KEYS = %i[id title description url image author guid published_at enclosure categories scraper].freeze
16+
PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
1717

1818
##
1919
# Removes the specified pattern from the beginning of the text
@@ -78,7 +78,7 @@ def description
7878
base: @to_h[:description],
7979
title:,
8080
url:,
81-
enclosure:,
81+
enclosures:,
8282
image:
8383
).call
8484
end
@@ -102,11 +102,16 @@ def guid
102102
@guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
103103
end
104104

105+
def enclosures
106+
@enclosures ||= Array(@to_h[:enclosures])
107+
.map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
108+
end
109+
105110
# @return [Html2rss::RssBuilder::Enclosure, nil]
106111
def enclosure
107112
return @enclosure if defined?(@enclosure)
108113

109-
case (object = @to_h[:enclosure])
114+
case (object = @to_h[:enclosures]&.first)
110115
when Hash
111116
@enclosure = Html2rss::RssBuilder::Enclosure.new(**object)
112117
when nil

spec/lib/html2rss/auto_source/scraper/html_spec.rb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
description: 'Article 1 Headline Teaser for article 1. Read more',
5656
id: '/article1/',
5757
published_at: nil,
58-
enclosure: nil }
58+
enclosures: [] }
5959
end
6060
let(:second_article) do
6161
{ title: 'Article 2 Headline',
@@ -64,7 +64,7 @@
6464
description: 'Article 2 Headline Teaser for article 2. Read more',
6565
id: '/article2/',
6666
published_at: nil,
67-
enclosure: nil }
67+
enclosures: [] }
6868
end
6969

7070
it 'yields articles' do
@@ -100,7 +100,7 @@
100100
description: '[Plonk]',
101101
id: '/',
102102
published_at: nil,
103-
enclosure: nil }
103+
enclosures: [] }
104104
end
105105

106106
let(:second_article) do
@@ -111,7 +111,7 @@
111111
description: 'Bla bla bla',
112112
id: '/',
113113
published_at: nil,
114-
enclosure: nil
114+
enclosures: []
115115
}
116116
end
117117

spec/lib/html2rss/html_extractor_spec.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@
4545
published_at: an_instance_of(DateTime),
4646
url: Addressable::URI.parse('https://example.com/sample'),
4747
image: an_instance_of(Addressable::URI),
48-
enclosure: a_hash_including(
48+
enclosures: [a_hash_including(
4949
url: an_instance_of(Addressable::URI),
5050
type: 'video/mp4'
51-
)
51+
)]
5252
)
5353

5454
expect(article_hash[:published_at].to_s).to eq '2024-02-24T12:00:00-03:00'
@@ -92,7 +92,7 @@
9292
image: be_a(Addressable::URI),
9393
description: 'FCK PTN Sample description',
9494
id: nil,
95-
published_at: be_a(DateTime), enclosure: nil }
95+
published_at: be_a(DateTime), enclosures: [] }
9696
end
9797

9898
it 'returns the details' do

spec/lib/html2rss/rendering/description_builder_spec.rb

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
RSpec.describe Html2rss::Rendering::DescriptionBuilder do
77
describe '#call' do
88
context 'when base is plain text' do
9-
subject(:description) { described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure: nil, image: nil).call }
9+
subject(:description) { described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures: nil, image: nil).call }
1010

1111
let(:base) { 'By John Doe' }
1212

@@ -17,7 +17,7 @@
1717

1818
context 'when base contains HTML' do
1919
subject(:description) do
20-
described_class.new(base:, title: 'Sample instance', url:, enclosure: nil, image: nil).call
20+
described_class.new(base:, title: 'Sample instance', url:, enclosures: nil, image: nil).call
2121
end
2222

2323
let(:base) { '<b>Some bold text</b>' }
@@ -34,7 +34,7 @@
3434
end
3535

3636
context 'when base starts with the title' do
37-
subject(:description) { described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure: nil, image: nil).call }
37+
subject(:description) { described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures: nil, image: nil).call }
3838

3939
let(:base) { 'Sample instance By John Doe' }
4040

@@ -44,7 +44,7 @@
4444
end
4545

4646
context 'when base is empty' do
47-
subject(:description) { described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure: nil, image: nil).call }
47+
subject(:description) { described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures: nil, image: nil).call }
4848

4949
let(:base) { '' }
5050

@@ -55,13 +55,13 @@
5555

5656
context 'when enclosure is an image' do
5757
subject(:doc) do
58-
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure:,
58+
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures:,
5959
image: nil).call
6060
Nokogiri::HTML.fragment(html)
6161
end
6262

6363
let(:base) { 'Caption' }
64-
let(:enclosure) { instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/image.jpg', type: 'image/jpeg') }
64+
let(:enclosures) { [instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/image.jpg', type: 'image/jpeg')] }
6565

6666
it 'renders <img> with attributes', :aggregate_failures do
6767
img = doc.at_css('img')
@@ -73,7 +73,7 @@
7373

7474
context 'when fallback image is present' do
7575
subject(:doc) do
76-
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure: nil,
76+
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures: nil,
7777
image:).call
7878
Nokogiri::HTML.fragment(html)
7979
end
@@ -89,13 +89,13 @@
8989

9090
context 'when enclosure is a video' do
9191
subject(:doc) do
92-
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure:,
92+
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures:,
9393
image: nil).call
9494
Nokogiri::HTML.fragment(html)
9595
end
9696

9797
let(:base) { 'Watch this' }
98-
let(:enclosure) { instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/video.mp4', type: 'video/mp4') }
98+
let(:enclosures) { [instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/video.mp4', type: 'video/mp4')] }
9999

100100
it 'renders <video> and <source>', :aggregate_failures do # rubocop:disable RSpec/ExampleLength
101101
video = doc.at_css('video')
@@ -109,13 +109,13 @@
109109

110110
context 'when enclosure is audio' do
111111
subject(:doc) do
112-
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure:,
112+
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures:,
113113
image: nil).call
114114
Nokogiri::HTML.fragment(html)
115115
end
116116

117117
let(:base) { 'Listen to this' }
118-
let(:enclosure) { instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/audio.mp3', type: 'audio/mpeg') }
118+
let(:enclosures) { [instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/audio.mp3', type: 'audio/mpeg')] }
119119

120120
it 'renders <audio> and <source>', :aggregate_failures do # rubocop:disable RSpec/ExampleLength
121121
audio = doc.at_css('audio')
@@ -129,13 +129,13 @@
129129

130130
context 'when enclosure is a PDF' do
131131
subject(:doc) do
132-
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosure:,
132+
html = described_class.new(base:, title: 'Sample instance', url: 'http://example.com', enclosures:,
133133
image: nil).call
134134
Nokogiri::HTML.fragment(html)
135135
end
136136

137137
let(:base) { 'See this document' }
138-
let(:enclosure) { instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/doc.pdf', type: 'application/pdf') }
138+
let(:enclosures) { [instance_double(Html2rss::RssBuilder::Enclosure, url: 'http://example.com/doc.pdf', type: 'application/pdf')] }
139139

140140
it 'renders <iframe>', :aggregate_failures do
141141
iframe = doc.at_css('iframe')

spec/lib/html2rss/rss_builder/article_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050

5151
it 'calls the DescriptionBuilder' do
5252
expect(Html2rss::Rendering::DescriptionBuilder).to have_received(:new)
53-
.with(base: 'By John Doe', title: 'Sample instance', url: instance.url, enclosure: nil, image: nil)
53+
.with(base: 'By John Doe', title: 'Sample instance', url: instance.url, enclosures: [], image: nil)
5454
end
5555
end
5656

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy