Node.js 爬虫练习:爬取数据并存储到 MongoDB

youkumovie(优酷电影目录)

这是本人的 Node.js 爬虫练习(大神请无视),会持续更新。爬虫会把爬取到的数据存储到 MongoDB 数据库中。总共 30 页,每 3 秒爬取一页,到目前为止亲测没有被封 IP。本人第一次写爬虫,请大家多多包涵!

用到的库

1
2
3
4
"cheerio": "^1.0.0-rc.2",
"mongodb": "^2.2.33",
"request": "^2.83.0",
"request-promise": "^4.2.2"

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
let rp = require('request-promise');
let cheerio = require('cheerio');
let MongoClient = require('mongodb').MongoClient
, assert = require('assert');

// 测试爬虫
let index = 1;
function main(){
if (index > 30){
clearInterval(timer);
console.log("恭喜你,数据已经全部爬取完毕!")
return;
}
rp('http://list.youku.com/category/show/c_96_s_1_d_1_p_' + index + '.html')
.then(function (res) {
// Process html...
let $ = cheerio.load(res)
let data = $('.title a').toArray()
for (let i = 0; i < data.length; i++) {
let videotitle = $('.title a').eq(i).attr('title');
let videolink = 'http:' + $('.title a').eq(i).attr('href');
let videoimg = $('.p-thumb img').eq(i).attr('src');
let videostate = $('.status').eq(i).text();
let url = 'mongodb://localhost:27017/test';
MongoClient.connect(url, function(err, db) {
db.collection('video').insertMany([{videotitle:videotitle},{videolink:videolink},{videoimg:videoimg},{videostate:videostate}])
});
console.log(videotitle + '--数据已插入')
}
})
.catch(function (err) {
// Crawling failed...
console.log(err)
});
index++
}

var timer = setInterval(main, 3000);