-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.js
41 lines (38 loc) · 1.08 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
const _ = require("lodash");
const request = require("request-promise");
const URL = require("url");
/**
 * Crawls a paginated site by following its "next" link from page to page.
 *
 * Each page is downloaded with `request`, scanned for `href…<` fragments, and
 * the first fragment matching `nextPattern` is taken as the link to the next
 * page. That link is resolved against the URL of the page it was found on and
 * printed to stdout as `"url",`. The crawl stops when a page has no matching
 * link, when the link cannot be resolved to an http(s) URL, or when it points
 * at a page that was already visited.
 *
 * @param {string} website - Absolute URL of the first page to crawl.
 * @param {RegExp} [nextPattern=/next/] - Pattern that identifies the "next"
 *   link fragment (for example `/Next </` or `/→/` for sites that label the
 *   link differently). Pass a non-global RegExp: `/g` makes `.test()` stateful.
 * @returns {Promise<string[]>} The URLs visited, in order, starting with
 *   `website`.
 * @throws {Error} (as a rejection) when `website` is missing. Errors raised by
 *   `request` while downloading a page are propagated unchanged.
 */
async function crawlSite(website, nextPattern = /next/) {
  if (!website) {
    throw new Error("Must provide website - e.g. node crawler.js [website]");
  }

  const visited = [website];
  // Guards against pagination loops, e.g. a last page whose "next" link
  // points back at an earlier page.
  const seen = new Set(visited);
  let current = website;

  while (true) {
    const html = await request(current);
    // `match` returns null when nothing matches; normalise to an empty list.
    const anchorTags = String(html).match(/href.*?</gi) || [];
    console.log("> Anchors: ", anchorTags);

    const nextHref = findNextHref(anchorTags, nextPattern);
    const nextUrl = nextHref ? resolveLink(nextHref, current) : null;
    if (!nextUrl || seen.has(nextUrl)) {
      break;
    }

    console.log(`"${nextUrl}",`);
    seen.add(nextUrl);
    visited.push(nextUrl);
    current = nextUrl;
  }

  return visited;
}

// Returns the href value of the first fragment matching `nextPattern`, or null
// when no fragment matches or the fragment carries no usable href.
function findNextHref(anchorTags, nextPattern) {
  const tag = anchorTags.find((candidate) => nextPattern.test(candidate));
  if (!tag) {
    return null;
  }
  // Accept double-quoted, single-quoted and unquoted attribute values.
  const match = tag.match(/\bhref\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i);
  if (!match) {
    return null;
  }
  const href = match[1] || match[2] || match[3] || "";
  // Attribute values are HTML-escaped in the page source; `&amp;` must become
  // `&` again or query strings with several parameters break.
  return href.replace(/&amp;/g, "&") || null;
}

// Resolves `href` against the page it was found on; returns the absolute
// http(s) URL without its fragment, or null when it cannot be crawled.
function resolveLink(href, pageUrl) {
  let resolved;
  try {
    // `URL` is Node's `url` module here (see the require at the top of the
    // file), so the WHATWG class is reached as `URL.URL`.
    resolved = new URL.URL(href, pageUrl);
  } catch (err) {
    // A malformed href ends the crawl instead of crashing it.
    return null;
  }
  if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
    return null; // e.g. `javascript:` or `mailto:` links
  }
  // Fragments are never sent to the server, so they must not make an
  // already-visited page look new.
  resolved.hash = "";
  return resolved.href;
}
// Script entry point: start crawling at the URL given as the first CLI
// argument. crawlSite is async, so a rejected promise (e.g. the usage error
// for a missing argument) is reported here with a non-zero exit code instead
// of surfacing as an unhandled rejection.
crawlSite(process.argv[2]).catch((err) => {
  console.error(err instanceof Error ? err.message : err);
  process.exitCode = 1;
});