-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.php
More file actions
74 lines (64 loc) · 2.39 KB
/
scrape.php
File metadata and controls
74 lines (64 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
<?php
/*
* PHP Bruteforce ID downloader & web scraping script
* @Author Luka Pušić luka@pusic.si
*
* Don't forget to change the $url below!
*/
//REQUIRED
//ID range to probe; the download loop runs while $n < $stop, so $stop itself is not requested
$start = 1;
$stop = 10000;

//OPTIONAL
//log file is named after the current Unix timestamp, so every run gets its own log
$logfile = time() . '.txt';
//directory (relative to the working directory) that receives the downloaded files
$scrapedir = 'scrape';

//open the log and record the start of the run
$log = fopen($logfile, 'w+') or die("can't open log");
fwrite($log, date('d.m.Y H:i') . "\tSTARTED at $start\n");

//make sure the output directory exists before any request is sent; the second
//is_dir() check tolerates another process creating the directory in the meantime
if (!is_dir($scrapedir) && !mkdir($scrapedir) && !is_dir($scrapedir)) {
    die("can't create scrape dir");
}
//settings shared by every request, hoisted out of the loop
$userAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
//matches the filename parameter of a Content-Disposition header line: either a
//quoted string or a bare token that ends at the next ';' or at the end of the line,
//so trailing parameters (size=..., filename*=...) are not glued onto the name
$filenamePattern = '/^content-disposition:[^\r\n]*?\bfilename=(?|"([^"\r\n]*)"|([^;\r\n]*))/im';
//sanitized names already written during this run (name => true)
$written = [];

for ($n = $start; $n < $stop; $n++) {
    //REQUIRED
    $url = "http://www.example.com/download.php?id=$n";

    //header-only request: fetch the response headers without the body
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true); // headers are part of the returned string
    curl_setopt($ch, CURLOPT_NOBODY, true);
    #curl_setopt($ch, CURLOPT_PROXY, "localhost:9050");
    #curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5);
    $header = curl_exec($ch);
    curl_close($ch);

    //request failed or no filename advertised: nothing to download for this ID
    if (!is_string($header) || !preg_match($filenamePattern, $header, $matches)) {
        echo "$n\tskipping...\n";
        fwrite($log, date('d.m.Y H:i') . "\t$n\tskipping...\n");
        continue;
    }

    //the filename comes from the remote server, so keep only safe characters
    $filename = (string) preg_replace('/[^a-zA-Z0-9.\-_]/', '', $matches[1]);
    //drop leading dots: rules out "." and ".." and avoids creating hidden files
    $filename = ltrim($filename, '.');
    if ($filename === '') {
        //nothing usable survived sanitizing; fall back to a name derived from the ID
        $filename = "file_$n";
    }
    if (isset($written[$filename])) {
        //another ID in this run already used this name; prefix the ID instead of overwriting
        $filename = $n . '_' . $filename;
    }

    //download the file body
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($ch, CURLOPT_REFERER, 'http://www.google.com');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    #curl_setopt($ch, CURLOPT_PROXY, "localhost:9050");
    #curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5);
    $contents = curl_exec($ch);
    $curlError = curl_error($ch); //must be read before the handle is closed
    curl_close($ch);

    //a failed transfer must not be stored as an empty file and logged as a success
    if ($contents === false) {
        echo "$n\tdownload failed: $curlError\n";
        fwrite($log, date('d.m.Y H:i') . "\t$n\tdownload failed: $curlError\n");
        continue;
    }

    //write file contents; one unwritable file should not abort the whole run
    if (file_put_contents($scrapedir . '/' . $filename, $contents) === false) {
        echo "$n\tcan't write $filename\n";
        fwrite($log, date('d.m.Y H:i') . "\t$n\tcan't write $filename\n");
        continue;
    }
    $written[$filename] = true;

    //stdout info and log entry for the saved file
    echo "$n\t$filename\n";
    fwrite($log, date('d.m.Y H:i') . "\t$n\t$filename\n");
}
//record the finish time as the last log entry, then release the log handle
$stoppedEntry = date('d.m.Y H:i') . "\tSTOPPED\n";
fwrite($log, $stoppedEntry);
fclose($log);
?>