Today I'll walk through a simple example: crawling data from a website using Node.js. Given a URL, I want to call it, then download every image on the page and save them to a directory.
I'm using the libraries below. Note that fs is built into Node.js, so it does not need to be installed:
npm install request-promise cheerio request
- request-promise : makes the HTTP call to the URL and returns the response body
- cheerio : parses the HTML DOM, so we can query elements with jQuery-style selectors (id, class) — see the sketch after this list
- request : streams responses, used here to download the images
- fs : Node's built-in file-system module, used to read and write files
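Here is a minimal sketch of request-promise and cheerio working together; the URL and the h1 selector are placeholders for illustration, not part of this project:

const rp = require("request-promise");
const cheerio = require("cheerio");

rp({
  uri: "https://example.com", // placeholder URL
  // Parse the response body into a cheerio object so $ works like jQuery.
  transform: body => cheerio.load(body),
})
  .then($ => {
    // Select by tag, #id, or .class, just like jQuery.
    console.log($("h1").text().trim());
  })
  .catch(err => console.error("Request failed:", err.message));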
Okay, once the project is set up, we need to create the following files and folders:
- create a dslink.txt file in the project directory, holding the list of URLs to crawl, one per line (see the sample after this list)
- create an images folder in the project directory, where the downloaded images will be saved
- create a data.json file, where the crawled info will be written
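For example, dslink.txt might look like this (hypothetical URLs):

https://example.com/chapter-1
https://example.com/chapter-2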
Okay, now create the index.js file:
const rp = require("request-promise");
const cheerio = require("cheerio");
const request = require("request");
const fs = require("fs");

// Pause between requests so we don't hammer the target server.
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// Download a file and save it into the ./images folder.
function download(uri, filename, callback) {
  request.head(uri, function (err, res) {
    if (err) {
      console.log("Download failed: " + uri);
      return callback(err);
    }
    console.log("content-type:", res.headers["content-type"]);
    console.log("content-length:", res.headers["content-length"]);
    request(uri)
      .pipe(fs.createWriteStream("./images/" + filename))
      .on("close", callback);
  });
}

// Read the list of URLs to crawl, one per line, skipping blank lines.
const dslink = "dslink.txt";
const arrayLink = fs.readFileSync(dslink).toString().split("\n").filter(line => line.trim());

async function crawler() {
  await sleep(1000);
  const data = [];
  for (const linkchay of arrayLink) {
    let $;
    try {
      $ = await rp({
        uri: linkchay,
        // Parse the HTML body into a cheerio object so we can use jQuery-style selectors.
        transform: body => cheerio.load(body),
      });
    } catch (error) {
      console.log("This link is down: " + linkchay);
      return error;
    }
    // The selectors below are specific to the target site.
    const title = $(".ten_title").text().trim();
    const tableContent = $(".info_content");
    // The title of this chapter.
    const chaperTitle = tableContent.find("p").text().trim();
    const chaperData = [];
    const chaperLink = tableContent.find("p").find("img");
    for (let j = 0; j < chaperLink.length; j++) {
      const post = $(chaperLink[j]);
      const postLink = post.attr("src");
      // Everything after the last "/" becomes the file name.
      const filename = postLink.substring(postLink.lastIndexOf("/") + 1);
      download(postLink, filename, function () {
        // console.log("Link: " + linkchay);
      });
      const postTitle = post.text().trim();
      chaperData.push({
        postTitle,
        linkchay,
        filename,
      });
    }
    data.push({
      chaperTitle,
      chaperData,
    });
    // Rewrite data.json with everything crawled so far.
    fs.writeFileSync("data.json", JSON.stringify(data));
    console.log(linkchay + " -------------> done");
    await sleep(1000);
  }
}

// Start the crawler.
crawler();
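Run the script with node index.js. Based on the code above, each entry written to data.json has this shape (the values shown are illustrative, not real output):

[
  {
    "chaperTitle": "Chapter 1",
    "chaperData": [
      {
        "postTitle": "",
        "linkchay": "https://example.com/chapter-1",
        "filename": "page-01.jpg"
      }
    ]
  }
]

Note that postTitle comes from calling .text() on an img tag, so it will usually be an empty string.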