youkumovie(优酷电影目录)
这是本人的 Node.js 爬虫练习,大神请无视,会持续更新。爬虫会把爬取到的数据存储到 MongoDB 数据库中。总共 30 页,每 3 秒爬取一页,到目前为止亲测没有被封 IP。本人第一次写爬虫,请大家多多包涵!
用到的库
1 2 3 4
| "cheerio": "^1.0.0-rc.2", "mongodb": "^2.2.33", "request": "^2.83.0", "request-promise": "^4.2.2"
|
代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
| let rp = require('request-promise'); let cheerio = require('cheerio'); let MongoClient = require('mongodb').MongoClient , assert = require('assert');
// Test crawler: each call to main() fetches one Youku movie listing page and
// stores every video found on it in MongoDB.

// Connection string of the MongoDB database that receives the records.
const MONGO_URL = 'mongodb://localhost:27017/test';
// Number of the last listing page to crawl.
const LAST_PAGE = 30;

// Number of the next listing page to fetch; advanced by main() on every call.
let index = 1;

/**
 * Builds one record per video from a parsed listing page.
 *
 * The i-th '.title a', '.p-thumb img' and '.status' elements are paired by
 * position, exactly as the page lists them.
 *
 * @param {Function} $ - cheerio-style selector function for the loaded page.
 * @returns {Array<Object>} One object per title link with the keys
 *   videotitle, videolink, videoimg and videostate.
 */
function extractVideos($) {
  // Run each selector once instead of once per field per row.
  const titleLinks = $('.title a');
  const thumbnails = $('.p-thumb img');
  const states = $('.status');
  const count = titleLinks.toArray().length;

  const videos = [];
  for (let i = 0; i < count; i++) {
    const link = titleLinks.eq(i);
    // All four fields belong to the same video, so they go into ONE document.
    videos.push({
      videotitle: link.attr('title'),
      videolink: 'http:' + link.attr('href'),
      videoimg: thumbnails.eq(i).attr('src'),
      videostate: states.eq(i).text(),
    });
  }
  return videos;
}

/**
 * Inserts the given records into the 'video' collection.
 *
 * Opens a single connection for the whole batch and closes it afterwards,
 * whether or not the insert succeeded.
 *
 * @param {Array<Object>} videos - Records produced by extractVideos().
 * @returns {Promise} Resolves once the batch is stored and the connection is
 *   closed; rejects with the connection or insert error.
 */
function saveVideos(videos) {
  if (videos.length === 0) {
    // Nothing to store, so do not open a connection at all.
    return Promise.resolve();
  }

  return MongoClient.connect(MONGO_URL).then(function (db) {
    return db.collection('video').insertMany(videos).then(
      function () {
        return db.close();
      },
      function (insertError) {
        // Release the connection first, then report the original failure.
        const rethrow = function () {
          throw insertError;
        };
        return db.close().then(rethrow, rethrow);
      }
    );
  });
}

/**
 * Timer callback: crawls the listing page numbered `index` and advances the
 * counter. Once every page up to LAST_PAGE has been requested it cancels the
 * interval stored in `timer` and logs a completion message.
 *
 * Request, parse and database failures are logged and do not stop the crawl.
 */
function main() {
  if (index > LAST_PAGE) {
    clearInterval(timer);
    console.log("恭喜你,数据已经全部爬取完毕!");
    return;
  }

  // Claim the page number before the asynchronous request starts.
  const page = index;
  index++;

  rp('http://list.youku.com/category/show/c_96_s_1_d_1_p_' + page + '.html')
    .then(function (res) {
      const videos = extractVideos(cheerio.load(res));
      return saveVideos(videos).then(function () {
        // Report success only after the insert has actually completed.
        videos.forEach(function (video) {
          console.log(video.videotitle + '--数据已插入');
        });
      });
    })
    .catch(function (err) {
      // Crawling or storing failed; log and let the next tick continue.
      console.log(err);
    });
}
// Start the crawl: main() runs once per interval until it cancels this timer
// itself via clearInterval(timer).
const CRAWL_INTERVAL_MS = 3000;
var timer = setInterval(main, CRAWL_INTERVAL_MS);
|