-
Notifications
You must be signed in to change notification settings - Fork 72
Expand file tree
/
Copy pathtelegram-parser.js
More file actions
328 lines (284 loc) · 10.6 KB
/
telegram-parser.js
File metadata and controls
328 lines (284 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
const cheerio = require('cheerio');
/** Replacement table for the characters that are significant in HTML text and attribute values. */
const HTML_ESCAPES = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;'
};

/**
 * Escape a value so it can be embedded in HTML text or inside a quoted attribute.
 *
 * @param {*} value - Value to escape; it is converted with `String()` first.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  return String(value).replace(/[&<>"']/g, (char) => HTML_ESCAPES[char]);
}

/**
 * Unwrap the first `url(...)` token of a CSS value.
 *
 * @param {string|undefined} cssValue - Raw CSS value, e.g. `url('https://x/y.jpg')`.
 * @returns {string} The value with the `url()` wrapper removed, or '' when the value is missing/empty.
 */
function extractCssUrl(cssValue) {
  return cssValue?.replace(/url\(['"]?([^'"]*)['"]?\)/, '$1') || '';
}

/**
 * Remove the first `px` occurrence from a CSS length.
 *
 * @param {string|undefined} cssValue - Raw CSS value, e.g. `320px`.
 * @returns {string} The value without `px`, or '' when the value is missing/empty.
 */
function stripPx(cssValue) {
  return cssValue?.replace('px', '') || '';
}

/**
 * Return a cheerio query function for `source`.
 *
 * @param {string|Function} source - Raw HTML, or a query function previously returned by
 *   `cheerio.load` (detected with `typeof source === 'function'`), which is reused as is so the
 *   same document is not parsed twice.
 * @returns {Function} The cheerio query function.
 */
function toCheerio(source) {
  return typeof source === 'function' ? source : cheerio.load(source);
}

/**
 * Drop U+FE0F (emoji variation selector) so "❤" and its emoji-presentation form compare equal.
 *
 * @param {string} text
 * @returns {string}
 */
function stripVariationSelectors(text) {
  return text.replace(/\uFE0F/g, '');
}

/**
 * Map a reaction emoji to its symbolic name.
 *
 * Only own properties of `emojiMap` are consulted, so reaction text such as "constructor" or
 * "toString" cannot resolve to members inherited from `Object.prototype`.
 *
 * @param {Object<string, string>} emojiMap - Emoji → name table.
 * @param {string} emoji - Emoji text scraped from the page.
 * @returns {string} The mapped name, or 'custom' when the emoji is unknown.
 */
function resolveReactionType(emojiMap, emoji) {
  const ownValue = (key) =>
    Object.prototype.hasOwnProperty.call(emojiMap, key) ? emojiMap[key] : undefined;

  const exact = ownValue(emoji);
  if (exact) {
    return exact;
  }

  // Second chance: compare with variation selectors removed on both sides.
  const bare = stripVariationSelectors(emoji);
  const looseKey = Object.keys(emojiMap).find((key) => stripVariationSelectors(key) === bare);
  return (looseKey !== undefined && emojiMap[looseKey]) || 'custom';
}

/**
 * Scrapes channel metadata and posts out of Telegram web-preview HTML using cheerio.
 */
class TelegramParser {
  constructor() {
    // Emoji → symbolic name used for the `type` field of parsed reactions.
    this.emojiMap = {
      '🔗': 'link',
      '📽': 'video',
      '💡': 'idea',
      '©': 'copyright',
      '❤': 'heart',
      '👍': 'like',
      '👎': 'dislike',
      '😁': 'happy',
      '😢': 'sad',
      '🤬': 'angry',
      '😐': 'neutral',
      '👏': 'clap',
      '🔥': 'fire',
      '🥴': 'drunk',
      '💯': '100',
      '🗿': 'moyai',
      '🎉': 'party',
      '😱': 'scream',
      '🏆': 'trophy'
    };
  }

  /**
   * Extract the channel header information.
   *
   * @param {string|Function} html - Page HTML, or an already-loaded cheerio query function.
   * @returns {{title: string, username: string, description: string, photo: string,
   *   subscribers: string, stats: {photos: (number|string), videos: (number|string),
   *   files: (number|string), links: (number|string)}}}
   *   `description` is the inner HTML of the description element. `subscribers` is the full text
   *   of the first counter element. Each stat stays at the numeric default `0` until a matching
   *   counter is found, in which case it holds that counter's text as scraped (a string).
   */
  parseChannelInfo(html) {
    const $ = toCheerio(html);

    const channelInfo = {
      title: $('.tgme_channel_info_header_title span').text().trim(),
      username: $('.tgme_channel_info_header_username a').text().trim(),
      description: $('.tgme_channel_info_description').html() || '',
      photo: '',
      subscribers: '',
      stats: {
        photos: 0,
        videos: 0,
        files: 0,
        links: 0
      }
    };

    // Candidate locations of the channel avatar, most specific first; the first hit wins.
    const photoSelectors = [
      '.tgme_channel_info_header i img',
      '.tgme_channel_info_header .tgme_page_photo_image img',
      '.tgme_channel_info_header img'
    ];
    for (const selector of photoSelectors) {
      const img = $(selector).first();
      if (img.length > 0) {
        channelInfo.photo = img.attr('src') || '';
        break;
      }
    }

    const counters = $('.tgme_channel_info_counter');
    channelInfo.subscribers = counters.first().text().trim();

    counters.each((i, elem) => {
      const counter = $(elem);
      const value = counter.find('.counter_value').text().trim();
      const type = counter.find('.counter_type').text().trim();
      // Tolerate a singular label (e.g. "photo") in case the page prints one for a count of 1.
      const key = type.endsWith('s') ? type : `${type}s`;
      if (Object.prototype.hasOwnProperty.call(channelInfo.stats, key)) {
        channelInfo.stats[key] = value;
      }
    });

    return channelInfo;
  }

  /**
   * Parse every `.tgme_widget_message_wrap` element into a post object.
   *
   * @param {string|Function} html - Page HTML, or an already-loaded cheerio query function.
   * @returns {Array<Object>} Posts in document order (see `parseSinglePost` for the shape).
   */
  parsePosts(html) {
    const $ = toCheerio(html);
    const posts = [];
    $('.tgme_widget_message_wrap').each((i, elem) => {
      const post = this.parseSinglePost($, $(elem));
      if (post) {
        posts.push(post);
      }
    });
    return posts;
  }

  /**
   * Parse one post element.
   *
   * @param {Function} $ - cheerio query function for the document.
   * @param {Object} postElement - cheerio selection wrapping the post.
   * @returns {{id: string, author: string, authorPhoto: string, text: string, media: Array,
   *   reactions: Array, views: string, time: string, link: string, edited: boolean}}
   */
  parseSinglePost($, postElement) {
    const post = {
      id: '',
      author: '',
      authorPhoto: '',
      text: '',
      media: [],
      reactions: [],
      views: '',
      time: '',
      link: '',
      edited: false
    };

    // `data-post` has the form "username/postID". Look on the element itself and then on its
    // descendants: `parsePosts` passes the outer wrapper, and the attribute may sit on an inner
    // message element instead. NOTE(review): confirm against current Telegram markup.
    const postAttr =
      postElement.attr('data-post') || postElement.find('[data-post]').first().attr('data-post');

    if (postAttr) {
      const parts = postAttr.split('/');
      post.id = parts.length > 1 ? parts[1] : postAttr;
      post.link = `https://t.me/${postAttr}`;
    } else {
      // Fallback: first link whose path contains an all-digit segment (/username/1234 or /1234).
      postElement.find('a[href*="/"]').each((i, elem) => {
        const href = $(elem).attr('href');
        const match = href?.match(/\/(\d+)(?:\/|$)/);
        if (match) {
          post.id = match[1];
          post.link = href;
          return false; // stop iterating at the first match
        }
        return true;
      });
    }

    post.author = postElement.find('.tgme_widget_message_owner_name span').text().trim();

    const authorImg = postElement.find('.tgme_widget_message_user_photo img');
    if (authorImg.length > 0) {
      post.authorPhoto = authorImg.attr('src') || '';
    }

    const textElement = postElement.find('.tgme_widget_message_text');
    if (textElement.length > 0) {
      post.text = this.parseTextContent($, textElement);
    }

    post.media = this.parseMedia($, postElement);
    post.reactions = this.parseReactions($, postElement);

    const viewsElement = postElement.find('.tgme_widget_message_views');
    if (viewsElement.length > 0) {
      post.views = viewsElement.text().trim();
    }

    // Prefer the machine-readable `datetime` attribute over the displayed text.
    const timeElement = postElement.find('.tgme_widget_message_date time');
    if (timeElement.length > 0) {
      post.time = timeElement.attr('datetime') || timeElement.text().trim();
    }

    const metaElement = postElement.find('.tgme_widget_message_meta');
    if (metaElement.length > 0 && metaElement.text().includes('edited')) {
      post.edited = true;
    }

    return post;
  }

  /**
   * Convert a message text element into a small, safe HTML fragment.
   *
   * The result only ever contains `<a>`, `<br>`, `<code>` and `<pre>` tags produced here; every
   * piece of scraped text and every `href` value is HTML-escaped, so post content cannot inject
   * markup into the page that renders the fragment. Hashtag links (link text starting with `#`)
   * are flattened to plain text. Elements with the `emoji` class contribute the text of their
   * `<b>` child. Any other element (bold, italic, quotes, ...) is flattened: its tag is dropped
   * but its children are still rendered, so nested text, links and line breaks are kept.
   *
   * @param {Function} $ - cheerio query function for the document.
   * @param {Object} textElement - cheerio selection wrapping the message text.
   * @returns {string} Trimmed HTML fragment.
   */
  parseTextContent($, textElement) {
    const renderNodes = (nodes) => {
      let html = '';
      nodes.each((i, node) => {
        const $node = $(node);
        if (node.type === 'text') {
          html += escapeHtml($node.text());
        } else if (node.tagName === 'a') {
          const href = $node.attr('href') || '';
          const label = $node.text().trim();
          if (label.startsWith('#')) {
            html += escapeHtml(label);
          } else {
            // rel="noopener noreferrer" keeps the opened page from reaching `window.opener`.
            html += `<a href="${escapeHtml(href)}" target="_blank" rel="noopener noreferrer">${escapeHtml(label)}</a>`;
          }
        } else if (node.tagName === 'br') {
          html += '<br>';
        } else if (node.tagName === 'code' || node.tagName === 'pre') {
          html += `<${node.tagName}>${escapeHtml($node.text().trim())}</${node.tagName}>`;
        } else if ($node.hasClass('emoji')) {
          html += escapeHtml($node.find('b').text().trim());
        } else {
          // Unknown wrapper: keep its content instead of silently dropping it.
          html += renderNodes($node.contents());
        }
      });
      return html;
    };

    return renderNodes(textElement.contents()).trim();
  }

  /**
   * Collect photo and video descriptors of a post (photos first, then videos).
   *
   * @param {Function} $ - cheerio query function for the document.
   * @param {Object} postElement - cheerio selection wrapping the post.
   * @returns {Array<Object>} Photo entries `{type, url, thumb, width, paddingTop}` and video
   *   entries `{type, url, thumb, duration, width, paddingTop}`; missing values are ''.
   */
  parseMedia($, postElement) {
    const media = [];

    // Photos: link target plus layout hints read from inline styles.
    postElement.find('.tgme_widget_message_photo_wrap').each((i, elem) => {
      const $photo = $(elem);
      media.push({
        type: 'photo',
        url: $photo.attr('href') || '',
        thumb: extractCssUrl($photo.css('background-image')),
        width: stripPx($photo.css('width')),
        paddingTop: $photo.find('.tgme_widget_message_photo').css('padding-top') || ''
      });
    });

    // Videos: same idea, the thumbnail and sizing live on child elements.
    postElement.find('.tgme_widget_message_video_player').each((i, elem) => {
      const $video = $(elem);
      const $wrap = $video.find('.tgme_widget_message_video_wrap');
      media.push({
        type: 'video',
        url: $video.attr('href') || '',
        thumb: extractCssUrl(
          $video.find('.tgme_widget_message_video_thumb').css('background-image')
        ),
        duration: $video.find('.message_video_duration').text().trim() || '',
        width: stripPx($wrap.css('width')),
        paddingTop: $wrap.css('padding-top') || ''
      });
    });

    return media;
  }

  /**
   * Collect the reactions of a post.
   *
   * @param {Function} $ - cheerio query function for the document.
   * @param {Object} postElement - cheerio selection wrapping the post.
   * @returns {Array<{emoji: string, count: string, type: string}>} `count` is the text found in
   *   the reaction's direct text nodes ('0' when empty); `type` is the `emojiMap` name of the
   *   emoji or 'custom' when it is not in the map.
   */
  parseReactions($, postElement) {
    const reactions = [];
    postElement.find('.tgme_reaction').each((i, elem) => {
      const $reaction = $(elem);
      const emoji = $reaction.find('.emoji b, .icon').text().trim();
      // The counter is the reaction's own text, i.e. its direct text-node children.
      const count = $reaction
        .contents()
        .filter((j, node) => node.type === 'text')
        .text()
        .trim();

      reactions.push({
        emoji: emoji,
        count: count || '0',
        type: resolveReactionType(this.emojiMap, emoji)
      });
    });
    return reactions;
  }

  /**
   * Parse channel info and all posts of a page. The HTML is loaded once and shared.
   *
   * @param {string|Function} html - Page HTML, or an already-loaded cheerio query function.
   * @returns {{channel: Object, posts: Array<Object>, totalPosts: number}} `totalPosts` is the
   *   number of `.tgme_widget_message_wrap` elements in the document.
   */
  parseFullPage(html) {
    const $ = toCheerio(html);
    return {
      channel: this.parseChannelInfo($),
      posts: this.parsePosts($),
      totalPosts: $('.tgme_widget_message_wrap').length
    };
  }

  /**
   * Normalise posts that come from JSON data (normal version with base64 images).
   *
   * Photo entries whose `url` is a `data:image/` URI are rewritten to
   * `{type, url, thumb, width, paddingTop}` with `thumb` set to the same data URI and
   * `paddingTop` taken from the entry's `height`. Every other media entry, and every post
   * without media, is returned unchanged. The input array and its post objects are never
   * mutated: posts that need rewriting are shallow-copied.
   *
   * @param {Array<Object>|null|undefined} posts - Posts decoded from JSON.
   * @returns {Array<Object>} Normalised posts; an empty array when `posts` is null/undefined.
   */
  parsePostsFromJson(posts) {
    if (posts == null) {
      return [];
    }

    const isInlinePhoto = (item) =>
      item?.type === 'photo' && typeof item.url === 'string' && item.url.startsWith('data:image/');

    return posts.map((post) => {
      if (!Array.isArray(post?.media) || post.media.length === 0) {
        return post;
      }

      const media = post.media.map((item) =>
        isInlinePhoto(item)
          ? {
              type: 'photo',
              url: item.url,
              thumb: item.url,
              width: item.width || '',
              paddingTop: item.height || ''
            }
          : item
      );
      return { ...post, media };
    });
  }
}
module.exports = TelegramParser;