# 爬虫
# 爬取百度搜图
TIP
下面这段代码爬取 https://image.baidu.com
网站上指定关键字的搜索结果中的图片
const fs = require("fs");
const path = require("path");
const http = require("http");
const https = require("https");
const cliProgress = require("cli-progress");
const { red, green, blue, cyan, gray } = require("kolorist");
const puppeteer = require("puppeteer");
// Directory (sibling of this script's folder) under which one sub-folder per keyword is created.
const rootPath = path.resolve(__dirname, "..", "baidus");

// Pieces of the entry URL of the image search site.
const protocol = "https";
const hostMain = "image.baidu.com";
const url = protocol + "://" + hostMain + "/";

// Search keywords; the list is passed through shuffle() before use.
let words = shuffle([
  "计算机", "前端", "后端", "架构",
  "docker", "jenkins", "ubuntu", "linux",
  "vue", "react", "angular", "jquery",
  "layui", "mvc", "mvp", "mvvm",
]);
/**
 * Entry point of the Baidu image crawler.
 *
 * Opens `url` in a headless browser, then for every entry of `words`:
 * types the keyword into the `#kw` search box, submits it, collects all
 * `.main_img` elements from the result page and hands each element's `src`
 * to `downloadImg`, saving it as `<rootPath>/<keyword>/<n>.jpg`. Files that
 * already exist on disk are not downloaded again.
 *
 * The browser and the progress bar are released even when a step throws;
 * a failure is printed and reflected in the process exit code.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: { width: 1920, height: 1080 },
    // args: ["--proxy-server=127.0.0.1:10809"],
  });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    console.log(green("\r\n前往页面"), blue(url));
    let current = 0;
    const total = words.length;
    // Progress bar: one tick per processed keyword.
    const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
    bar.start(total, current);
    try {
      for (const keyword of words) {
        console.log(green("\r\n搜索关键词"), blue(keyword));
        // Put the focus into the search input.
        await page.waitForSelector("#kw");
        await page.focus("#kw");
        // Clear the previous query: Ctrl + A, then Backspace.
        await page.keyboard.down("Control");
        await page.keyboard.press("A");
        await sleep(100 + Math.floor(Math.random() * 200));
        await page.keyboard.up("Control");
        await page.keyboard.press("Backspace");
        // Type the keyword.
        await page.keyboard.type(keyword);
        // Submit with Enter. The navigation wait must be registered BEFORE the
        // key press; otherwise a fast navigation can finish first and the wait
        // would hang until it times out.
        await Promise.all([page.waitForNavigation(), page.keyboard.press("Enter")]);
        // Collect the result images.
        const images = await page.$$(".main_img");
        console.log(images.length);
        // The keyword is logged as its own argument: the color helper only
        // formats its first parameter, so a second one would be dropped.
        if (images.length === 0) console.log(red("失败"), blue(keyword));
        // Create the target directory for this keyword.
        await mkdir(path.join(rootPath, `${keyword}`));
        await sleep(100 + Math.floor(Math.random() * 100));
        let index = 0;
        for (const image of images) {
          index++;
          const filePath = path.join(rootPath, `${keyword}/${index}.jpg`);
          const src = await (await image.getProperty("src")).jsonValue();
          // Nothing to fetch without a source; skip files already on disk.
          if (!src || fs.existsSync(filePath)) continue;
          await downloadImg(src, filePath);
          await sleep(100 + Math.floor(Math.random() * 100));
        }
        // Random pause between keywords.
        await sleep(100 + Math.floor(Math.random() * 1000));
        bar.update(++current);
      }
    } finally {
      bar.stop();
    }
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Do not leave the promise floating: report the failure explicitly.
  console.error(err);
  process.exitCode = 1;
});
# 爬取古诗词文
TIP
下面这段代码爬取 so.gushiwen.cn 网站上指定诗人的第一页诗词的内容,并写入到 poems.json
const fs = require("fs");
const path = require("path");
const http = require("http");
const https = require("https");
const cliProgress = require("cli-progress");
const { red, green, blue, cyan, gray } = require("kolorist");
const puppeteer = require("puppeteer");
const { parse } = require("node-html-parser");
// Directory (sibling of this script's folder) that receives poems.json.
const rootPath = path.resolve(__dirname, "..", "gushicis");

// Listing page of the poem site; the poet is appended as the `astr` query parameter.
const baseUrl = "https://so.gushiwen.cn/shiwens/default.aspx";

// Poets to crawl; the list is passed through shuffle() before use.
let poets = shuffle([
  "李白",
  "杜甫",
  "辛弃疾",
  "白居易",
  "苏轼",
  "王维",
  "杜牧",
  "陆游",
]);
/**
 * Entry point of the poem crawler.
 *
 * For every entry of `poets` it opens `<baseUrl>?astr=<poet>`, reads the
 * outer HTML of every `.sons .cont` element, and for each one that contains
 * a `.contson` node extracts `{ title, author, content, array }`, where
 * `array` is the content split on "。" with blank parts removed. All poems
 * are finally written as pretty-printed JSON to `<rootPath>/poems.json`.
 *
 * A poet whose page never shows the expected selector is reported as failed
 * and skipped instead of aborting the whole run. The browser and progress
 * bar are released even when a step throws.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: { width: 1920, height: 1080 },
    // args: ["--proxy-server=127.0.0.1:10809"],
  });
  try {
    const page = await browser.newPage();
    let current = 0;
    const total = poets.length;
    // Progress bar: one tick per processed poet.
    const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
    bar.start(total, current);
    const result = [];
    try {
      for (const poet of poets) {
        const url = `${baseUrl}?astr=${poet}`;
        console.log(green("\r\n前往页面"), blue(url));
        await page.goto(url);
        try {
          await page.waitForSelector(".sons > .cont .contson");
        } catch (err) {
          // waitForSelector rejects when the selector never appears; treat
          // that timeout as "no poems for this poet" and move on. Anything
          // else is unexpected and is rethrown.
          if (!err || err.name !== "TimeoutError") throw err;
          console.log(red("失败"), blue(poet));
          bar.update(++current);
          continue;
        }
        await sleep(100 + Math.floor(Math.random() * 100));
        // Runs inside the page: serialize each poem container to HTML so it
        // can be parsed on the Node side.
        const poems = await page.evaluate(() =>
          Array.from(document.querySelectorAll(".sons .cont"), (el) => el.outerHTML)
        );
        // Log the current poet (not the whole list) as its own argument; the
        // color helper only formats its first parameter.
        if (poems.length === 0) console.log(red("失败"), blue(poet));
        for (const html of poems) {
          const node = parse(html);
          const contentNode = node.querySelector(".contson");
          if (!contentNode) continue;
          // The title node may be absent; fall back to an empty title rather
          // than throwing on a null dereference.
          const titleNode = node.querySelector("p a b");
          const title = titleNode ? titleNode.textContent : "";
          const author = poet;
          let content = contentNode.textContent;
          // Strip parenthesized annotations. Non-greedy, so two annotations
          // on one line do not swallow the verse text between them.
          content = content.replace(/\(.*?\)/g, "");
          // Remove line breaks together with the indentation that follows.
          content = content.replace(/\n\s*/g, "");
          const array = content.split(/。/).filter((item) => item.trim() !== "");
          result.push({ title, author, content, array });
        }
        bar.update(++current);
      }
    } finally {
      bar.stop();
    }
    // Make sure the output directory exists before writing into it.
    fs.mkdirSync(rootPath, { recursive: true });
    fs.writeFileSync(path.join(rootPath, `poems.json`), JSON.stringify(result, null, 2));
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Do not leave the promise floating: report the failure explicitly.
  console.error(err);
  process.exitCode = 1;
});