-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontent.go
More file actions
218 lines (199 loc) · 6.02 KB
/
content.go
File metadata and controls
218 lines (199 loc) · 6.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
package defuddle
import (
"log/slog"
"sort"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/dotcommander/defuddle/internal/constants"
"github.com/dotcommander/defuddle/internal/scoring"
"github.com/dotcommander/defuddle/internal/text"
)
// Reference for findMainContent (defined below), which finds the main
// content element. The Go implementation diverges from this JavaScript
// original: it scores every match of every entry-point selector rather than
// returning the first hit.
//
// JavaScript original code:
//
//	private findMainContent(doc: Document): Element | null {
//		// Try entry point elements first
//		for (const selector of ENTRY_POINT_ELEMENTS) {
//			const element = doc.querySelector(selector);
//			if (element) {
//				return element;
//			}
//		}
//
//		// Try table-based content
//		const tableContent = this.findTableBasedContent(doc);
//		if (tableContent) {
//			return tableContent;
//		}
//
//		// Try content scoring
//		const scoredContent = this.findContentByScoring(doc);
//		if (scoredContent) {
//			return scoredContent;
//		}
//
//		return null;
//	}

// contentCandidate represents a scored entry point match.
type contentCandidate struct {
	// element is the selection matched by an entry-point selector.
	element *goquery.Selection
	// score is the selector-priority base score plus the content score
	// returned by scoring.ScoreElement.
	score float64
	// selectorIndex is the position of the matching selector in the
	// entry-point list; a lower index means a higher-priority selector.
	selectorIndex int
}
// findMainContent picks the element most likely to hold the page's main
// content.
//
// Every match of every entry-point selector is scored as a selector-priority
// base (earlier selectors score higher, in steps of 40) plus
// scoring.ScoreElement, and the highest score wins, with two adjustments:
//
//   - if <body> is the only candidate, table-based detection is tried first;
//   - unless the winner contains three or more <article> elements (a listing
//     page), a candidate contained in the current pick that matched a
//     higher-priority selector and has more than 50 words replaces it.
//
// When no entry-point selector matches, the result of findContentByScoring is
// returned, which may be nil.
func (d *Defuddle) findMainContent(doc *goquery.Document) *goquery.Selection {
	entryPoints := constants.GetEntryPointElements()

	// Score ALL matches from ALL entry point selectors.
	var candidates []contentCandidate
	for i, selector := range entryPoints {
		// Base score from selector priority (earlier = higher).
		base := float64(len(entryPoints)-i) * 40
		doc.Find(selector).Each(func(_ int, element *goquery.Selection) {
			candidates = append(candidates, contentCandidate{
				element:       element,
				score:         base + scoring.ScoreElement(element),
				selectorIndex: i,
			})
		})
	}

	if len(candidates) == 0 {
		// No entry point matched: fall back to scoring block elements.
		scored := d.findContentByScoring(doc)
		if scored != nil && d.debug {
			slog.Debug("Found main content using scoring")
		}
		return scored
	}

	// Highest score first. The sort is stable so that candidates with equal
	// scores keep selector-priority and document order rather than an
	// unspecified one.
	sort.SliceStable(candidates, func(a, b int) bool {
		return candidates[a].score > candidates[b].score
	})

	if d.debug {
		for _, c := range candidates {
			slog.Debug("Content candidate",
				"tag", goquery.NodeName(c.element),
				"class", c.element.AttrOr("class", ""),
				"id", c.element.AttrOr("id", ""),
				"score", c.score, "selectorIndex", c.selectorIndex)
		}
	}

	// If we only matched body, try table-based detection.
	if len(candidates) == 1 && strings.EqualFold(goquery.NodeName(candidates[0].element), "body") {
		if tableContent := d.findTableBasedContent(doc); tableContent != nil {
			if d.debug {
				slog.Debug("Found main content using table-based detection")
			}
			return tableContent
		}
	}

	// If the top candidate contains a child candidate that matched a
	// higher-priority selector (lower index), prefer the more specific child.
	// This prevents e.g. <main> from winning over a contained <article>
	// just because sibling noise inflates the parent's content score.
	best := candidates[0]

	// Don't descend into a child on listing pages (multiple articles).
	if best.element.Find("article").Length() < 3 {
		for _, child := range candidates[1:] {
			// Cheap checks first: extracting and counting the subtree text
			// is only worth doing for candidates that could actually win.
			if child.selectorIndex >= best.selectorIndex {
				continue
			}
			if !scoring.NodeContains(best.element, child.element) {
				continue
			}
			if text.CountWords(strings.TrimSpace(child.element.Text())) > 50 {
				best = child
			}
		}
	}

	if d.debug {
		slog.Debug("Selected main content", "tag", goquery.NodeName(best.element), "score", best.score)
	}
	return best.element
}
// findTableBasedContent finds content in table-based layouts and returns the
// best-scoring <td>, or nil when nothing qualifies.
//
// Unlike the JavaScript original below, cells are only examined when at
// least one <table> looks like a layout table: a numeric width attribute
// above 400, align="center" (case-insensitive), or a class containing
// "content" or "article". The winning cell must score above 50.
//
// JavaScript original code:
//
//	private findTableBasedContent(doc: Document): Element | null {
//		const tables = doc.querySelectorAll('table');
//		let bestTable: Element | null = null;
//		let bestScore = 0;
//
//		tables.forEach(table => {
//			const cells = table.querySelectorAll('td');
//			cells.forEach(cell => {
//				const score = ContentScorer.scoreElement(cell);
//				if (score > bestScore) {
//					bestScore = score;
//					bestTable = cell;
//				}
//			});
//		});
//
//		return bestScore > 50 ? bestTable : null;
//	}
func (d *Defuddle) findTableBasedContent(doc *goquery.Document) *goquery.Selection {
	tables := doc.Find("table")

	// isLayoutTable reports whether a table carries one of the layout hints.
	isLayoutTable := func(table *goquery.Selection) bool {
		// A missing or non-numeric width fails to parse and is skipped.
		if px, err := strconv.Atoi(table.AttrOr("width", "")); err == nil && px > 400 {
			return true
		}
		if strings.EqualFold(table.AttrOr("align", ""), "center") {
			return true
		}
		class := strings.ToLower(table.AttrOr("class", ""))
		return strings.Contains(class, "content") || strings.Contains(class, "article")
	}

	// Pre-guard: stop at the first table that looks like a layout table.
	layoutFound := false
	tables.EachWithBreak(func(_ int, table *goquery.Selection) bool {
		layoutFound = isLayoutTable(table)
		return !layoutFound
	})
	if !layoutFound {
		return nil
	}

	// Keep the highest-scoring cell; on equal scores the first one seen stays.
	var (
		winner   *goquery.Selection
		topScore float64
	)
	tables.Each(func(_ int, table *goquery.Selection) {
		table.Find("td").Each(func(_ int, cell *goquery.Selection) {
			if s := scoring.ScoreElement(cell); s > topScore {
				winner, topScore = cell, s
			}
		})
	})

	if topScore > 50 {
		return winner
	}
	return nil
}
// findContentByScoring finds content using the scoring algorithm: it gathers
// block-level elements and hands them to scoring.FindBestElement with a
// second argument of 50 (presumably a minimum score; confirm in the scoring
// package).
//
// Unlike the JavaScript original below, the Go selector also includes aside,
// header, footer, nav and content.
//
// JavaScript original code:
//
//	private findContentByScoring(doc: Document): Element | null {
//		const candidates = doc.querySelectorAll('div, section, article, main');
//		const elements = Array.from(candidates);
//		return ContentScorer.findBestElement(elements, 50);
//	}
func (d *Defuddle) findContentByScoring(doc *goquery.Document) *goquery.Selection {
	const blockSelector = "div, section, article, main, aside, header, footer, nav, content"

	// pool stays nil when nothing matches, as in the original.
	var pool []*goquery.Selection
	collect := func(_ int, node *goquery.Selection) {
		pool = append(pool, node)
	}
	doc.Find(blockSelector).Each(collect)

	return scoring.FindBestElement(pool, 50)
}