Skip to content

Commit e2f6b44

Browse files
Add content filtering package
Co-authored-by: SamMorrowDrums <4811358+SamMorrowDrums@users.noreply.github.com>
1 parent 015b8b6 commit e2f6b44

File tree

2 files changed

+306
-0
lines changed

2 files changed

+306
-0
lines changed

pkg/filtering/content_filter.go

Lines changed: 133 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,133 @@
1+
package filtering
2+
3+
import (
4+
"regexp"
5+
"strings"
6+
)
7+
8+
var (
	// invisibleCharsRegex matches invisible Unicode code points:
	// U+200B-U+200F (zero-width space, zero-width joiner/non-joiner,
	// left-to-right and right-to-left marks), U+2028-U+202E (line and
	// paragraph separators, bidirectional embedding/override controls),
	// U+2060-U+2064 (word joiner and invisible operators) and U+FEFF
	// (zero-width no-break space / byte order mark).
	invisibleCharsRegex = regexp.MustCompile(`[\x{200B}-\x{200F}\x{2028}-\x{202E}\x{2060}-\x{2064}\x{FEFF}]`)

	// htmlCommentsRegex matches an HTML comment, including multi-line ones,
	// up to the first closing "-->".
	htmlCommentsRegex = regexp.MustCompile(`<!--[\s\S]*?-->`)

	// The patterns below match specific HTML elements, including their bodies,
	// that could carry hidden content. Go's regexp has no backreferences, so
	// every tag is listed explicitly. HTML tag names are case-insensitive, so
	// each pattern carries the (?i) flag; otherwise "<SCRIPT>" would slip past.
	//
	// NOTE(review): <link> and <embed> are void elements in HTML and normally
	// have no closing tag; these two patterns only match the paired form.
	htmlScriptRegex = regexp.MustCompile(`(?i)<script[^>]*>[\s\S]*?</script>`)
	htmlStyleRegex  = regexp.MustCompile(`(?i)<style[^>]*>[\s\S]*?</style>`)
	htmlIframeRegex = regexp.MustCompile(`(?i)<iframe[^>]*>[\s\S]*?</iframe>`)
	htmlObjectRegex = regexp.MustCompile(`(?i)<object[^>]*>[\s\S]*?</object>`)
	htmlEmbedRegex  = regexp.MustCompile(`(?i)<embed[^>]*>[\s\S]*?</embed>`)
	htmlSvgRegex    = regexp.MustCompile(`(?i)<svg[^>]*>[\s\S]*?</svg>`)
	htmlMathRegex   = regexp.MustCompile(`(?i)<math[^>]*>[\s\S]*?</math>`)
	htmlLinkRegex   = regexp.MustCompile(`(?i)<link[^>]*>[\s\S]*?</link>`)

	// htmlAttributesRegex matches a whole tag that carries at least one
	// double-quoted style, data-*, hidden or class attribute.
	htmlAttributesRegex = regexp.MustCompile(`<[^>]*(?:style|data-[\w-]+|hidden|class)="[^"]*"[^>]*>`)

	// collapsedSectionsRegex matches a collapsed section from a bare, lowercase
	// "<details>" opening tag (no attributes) to the first "</details>".
	collapsedSectionsRegex = regexp.MustCompile(`<details>[\s\S]*?</details>`)

	// smallTextRegex matches an element whose inline style sets font-size to
	// 0, a fraction below 1, or a single digit 0-3 in px, pt, em or %, up to
	// the first closing tag that follows. Matching is case-insensitive because
	// CSS property names and units are.
	smallTextRegex = regexp.MustCompile(`(?i)<[^>]*style="[^"]*font-size:\s*(?:0|0\.\d+|[0-3])(?:px|pt|em|%)[^"]*"[^>]*>[\s\S]*?</[^>]+>`)

	// excessiveWhitespaceRegex matches runs of four or more consecutive newlines.
	excessiveWhitespaceRegex = regexp.MustCompile(`\n{4,}`)
)
41+
42+
// Config carries the settings that control content filtering.
type Config struct {
	// DisableContentFiltering switches all content filtering off when set to true.
	DisableContentFiltering bool
}

// DefaultConfig returns a freshly allocated Config with content filtering
// enabled (DisableContentFiltering is false).
func DefaultConfig() *Config {
	// The zero value of Config already has filtering enabled.
	defaults := new(Config)
	return defaults
}
54+
55+
// FilterContent filters potentially hidden content from the input text
56+
// This includes invisible Unicode characters, HTML comments, and other methods of hiding content
57+
func FilterContent(input string, cfg *Config) string {
58+
if cfg != nil && cfg.DisableContentFiltering {
59+
return input
60+
}
61+
62+
if input == "" {
63+
return input
64+
}
65+
66+
// Process the input text through each filter
67+
result := input
68+
69+
// Remove invisible characters
70+
result = invisibleCharsRegex.ReplaceAllString(result, "")
71+
72+
// Replace HTML comments with a marker
73+
result = htmlCommentsRegex.ReplaceAllString(result, "[HTML_COMMENT]")
74+
75+
// Replace potentially dangerous HTML elements
76+
result = htmlScriptRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
77+
result = htmlStyleRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
78+
result = htmlIframeRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
79+
result = htmlObjectRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
80+
result = htmlEmbedRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
81+
result = htmlSvgRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
82+
result = htmlMathRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
83+
result = htmlLinkRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
84+
85+
// Replace HTML attributes that might be used for hiding
86+
result = htmlAttributesRegex.ReplaceAllStringFunc(result, cleanHTMLAttributes)
87+
88+
// Replace collapsed sections with visible indicator
89+
result = collapsedSectionsRegex.ReplaceAllStringFunc(result, makeCollapsedSectionVisible)
90+
91+
// Replace very small text with visible indicator
92+
result = smallTextRegex.ReplaceAllString(result, "[SMALL_TEXT]")
93+
94+
// Normalize excessive whitespace
95+
result = excessiveWhitespaceRegex.ReplaceAllString(result, "\n\n\n")
96+
97+
return result
98+
}
99+
100+
// hidingAttributesRegex matches one double-quoted style, data-*, hidden or
// class attribute together with the whitespace in front of it. It is compiled
// once at package scope because cleanHTMLAttributes runs once per matched tag.
// Attribute names are case-insensitive in HTML, hence the (?i) flag.
var hidingAttributesRegex = regexp.MustCompile(`(?i)\s+(?:style|data-[\w-]+|hidden|class)="[^"]*"`)

// cleanHTMLAttributes returns tag with every double-quoted style, data-*,
// hidden and class attribute removed, regardless of the attribute name's
// letter case. All other attributes and the rest of the string are left
// untouched.
//
// This is a deliberately simple regex-based implementation; it does not parse
// HTML, so unquoted, single-quoted or valueless attributes are not removed.
func cleanHTMLAttributes(tag string) string {
	return hidingAttributesRegex.ReplaceAllString(tag, "")
}
107+
108+
// summaryTagRegex captures the text between the first <summary> and the first
// </summary> that follows it. The (?s) flag lets "." span newlines so that a
// multi-line summary is captured instead of being silently dropped. It is
// compiled once at package scope because the function below runs per match.
var summaryTagRegex = regexp.MustCompile(`(?s)<summary>(.*?)</summary>`)

// makeCollapsedSectionVisible rewrites a <details> section as always-visible
// text of the form "\n\n**<summary>:**\n<content>\n\n".
//
// The heading is the text of the first <summary> element, or
// "Collapsed section" when there is none. The content is everything after the
// first "</summary>" with a trailing "</details>" removed; without a summary
// it is the input with the leading "<details>" and trailing "</details>"
// removed. Nested <details> elements inside the content are kept verbatim.
func makeCollapsedSectionVisible(detailsSection string) string {
	const (
		summaryClose = "</summary>"
		detailsOpen  = "<details>"
		detailsClose = "</details>"
	)

	summary := "Collapsed section"
	if m := summaryTagRegex.FindStringSubmatch(detailsSection); m != nil {
		summary = m[1]
	}

	var content string
	if i := strings.Index(detailsSection, summaryClose); i >= 0 {
		// Keep only what follows the summary; the heading already shows it.
		content = strings.TrimSuffix(detailsSection[i+len(summaryClose):], detailsClose)
	} else {
		// No summary tag: just unwrap the details element.
		content = strings.TrimPrefix(detailsSection, detailsOpen)
		content = strings.TrimSuffix(content, detailsClose)
	}

	return "\n\n**" + summary + ":**\n" + content + "\n\n"
}

pkg/filtering/content_filter_test.go

Lines changed: 173 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,173 @@
1+
package filtering
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestFilterContent(t *testing.T) {
8+
tests := []struct {
9+
name string
10+
input string
11+
expected string
12+
cfg *Config
13+
}{
14+
{
15+
name: "Empty string",
16+
input: "",
17+
expected: "",
18+
cfg: DefaultConfig(),
19+
},
20+
{
21+
name: "Normal text without hidden content",
22+
input: "This is normal text without any hidden content.",
23+
expected: "This is normal text without any hidden content.",
24+
cfg: DefaultConfig(),
25+
},
26+
{
27+
name: "Text with invisible characters",
28+
input: "Hidden\u200Bcharacters\u200Bin\u200Bthis\u200Btext",
29+
expected: "Hiddencharactersinthistext",
30+
cfg: DefaultConfig(),
31+
},
32+
{
33+
name: "Text with HTML comments",
34+
input: "This has a <!-- hidden comment --> in it.",
35+
expected: "This has a [HTML_COMMENT] in it.",
36+
cfg: DefaultConfig(),
37+
},
38+
{
39+
name: "Text with HTML elements",
40+
input: "This has <script>alert('hidden')</script> scripts.",
41+
expected: "This has [HTML_ELEMENT] scripts.",
42+
cfg: DefaultConfig(),
43+
},
44+
{
45+
name: "Text with details/summary",
46+
input: "Collapsed content: <details><summary>Click me</summary>Hidden content</details>",
47+
expected: "Collapsed content: \n\n**Click me:**\nHidden content\n\n",
48+
cfg: DefaultConfig(),
49+
},
50+
{
51+
name: "Text with small font",
52+
input: "This has <span style=\"font-size:1px\">hidden tiny text</span> in it.",
53+
expected: "This has <span>hidden tiny text</span> in it.",
54+
cfg: DefaultConfig(),
55+
},
56+
{
57+
name: "Text with excessive whitespace",
58+
input: "Line 1\n\n\n\n\n\nLine 2",
59+
expected: "Line 1\n\n\nLine 2",
60+
cfg: DefaultConfig(),
61+
},
62+
{
63+
name: "Text with HTML attributes",
64+
input: "<p data-hidden=\"true\" style=\"display:none\">Hidden paragraph</p>",
65+
expected: "<p>Hidden paragraph</p>",
66+
cfg: DefaultConfig(),
67+
},
68+
{
69+
name: "Filtering disabled",
70+
input: "Hidden\u200Bcharacters and <!-- comments -->",
71+
expected: "Hidden\u200Bcharacters and <!-- comments -->",
72+
cfg: &Config{DisableContentFiltering: true},
73+
},
74+
{
75+
name: "Nil config uses default (filtering enabled)",
76+
input: "Hidden\u200Bcharacters",
77+
expected: "Hiddencharacters",
78+
cfg: nil,
79+
},
80+
{
81+
name: "Normal markdown with code blocks",
82+
input: "# Title\n\n```go\nfunc main() {\n fmt.Println(\"Hello, world!\")\n}\n```",
83+
expected: "# Title\n\n```go\nfunc main() {\n fmt.Println(\"Hello, world!\")\n}\n```",
84+
cfg: DefaultConfig(),
85+
},
86+
{
87+
name: "GitHub flavored markdown with tables",
88+
input: "| Header 1 | Header 2 |\n| -------- | -------- |\n| Cell 1 | Cell 2 |",
89+
expected: "| Header 1 | Header 2 |\n| -------- | -------- |\n| Cell 1 | Cell 2 |",
90+
cfg: DefaultConfig(),
91+
},
92+
}
93+
94+
for _, tc := range tests {
95+
t.Run(tc.name, func(t *testing.T) {
96+
result := FilterContent(tc.input, tc.cfg)
97+
if result != tc.expected {
98+
t.Errorf("FilterContent() = %q, want %q", result, tc.expected)
99+
}
100+
})
101+
}
102+
}
103+
104+
func TestMakeCollapsedSectionVisible(t *testing.T) {
105+
tests := []struct {
106+
name string
107+
input string
108+
expected string
109+
}{
110+
{
111+
name: "Simple details/summary",
112+
input: "<details><summary>Click me</summary>Hidden content</details>",
113+
expected: "\n\n**Click me:**\nHidden content\n\n",
114+
},
115+
{
116+
name: "Details without summary",
117+
input: "<details>Hidden content</details>",
118+
expected: "\n\n**Collapsed section:**\nHidden content\n\n",
119+
},
120+
{
121+
name: "Nested content",
122+
input: "<details><summary>Outer</summary>Content<details><summary>Inner</summary>Nested</details></details>",
123+
expected: "\n\n**Outer:**\nContent<details><summary>Inner</summary>Nested</details>\n\n",
124+
},
125+
}
126+
127+
for _, tc := range tests {
128+
t.Run(tc.name, func(t *testing.T) {
129+
result := makeCollapsedSectionVisible(tc.input)
130+
if result != tc.expected {
131+
t.Errorf("makeCollapsedSectionVisible() = %q, want %q", result, tc.expected)
132+
}
133+
})
134+
}
135+
}
136+
137+
func TestCleanHTMLAttributes(t *testing.T) {
138+
tests := []struct {
139+
name string
140+
input string
141+
expected string
142+
}{
143+
{
144+
name: "Tag with style attribute",
145+
input: "<p style=\"display:none\">Hidden</p>",
146+
expected: "<p>Hidden</p>",
147+
},
148+
{
149+
name: "Tag with data attribute",
150+
input: "<p data-hidden=\"true\">Hidden</p>",
151+
expected: "<p>Hidden</p>",
152+
},
153+
{
154+
name: "Tag with multiple attributes",
155+
input: "<p id=\"para\" style=\"display:none\" data-test=\"value\">Hidden</p>",
156+
expected: "<p id=\"para\">Hidden</p>",
157+
},
158+
{
159+
name: "Tag with allowed attributes",
160+
input: "<a href=\"https://example.com\" target=\"_blank\">Link</a>",
161+
expected: "<a href=\"https://example.com\" target=\"_blank\">Link</a>",
162+
},
163+
}
164+
165+
for _, tc := range tests {
166+
t.Run(tc.name, func(t *testing.T) {
167+
result := cleanHTMLAttributes(tc.input)
168+
if result != tc.expected {
169+
t.Errorf("cleanHTMLAttributes() = %q, want %q", result, tc.expected)
170+
}
171+
})
172+
}
173+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy