# 爬虫

# 爬取百度搜图

TIP

下面这段代码爬取 https://image.baidu.com 网站上指定关键字的搜索结果中的图片，并按关键字保存到本地目录。

const fs = require("fs");
const path = require("path");
const http = require("http");
const https = require("https");
const cliProgress = require("cli-progress");
const { red, green, blue, cyan, gray } = require("kolorist");
const puppeteer = require("puppeteer");

// Directory (relative to this script) that receives one sub-folder per keyword.
const rootPath = path.resolve(__dirname, "../baidus/");

// Pieces of the start URL; kept separate so each part can be changed on its own.
const protocol = "https";
const hostMain = "image.baidu.com";
const url = protocol + "://" + hostMain + "/";

// Search keywords, passed through `shuffle` so each run visits them in the
// order that helper returns.
let words = shuffle([
  "计算机",
  "前端",
  "后端",
  "架构",
  "docker",
  "jenkins",
  "ubuntu",
  "linux",
  "vue",
  "react",
  "angular",
  "jquery",
  "layui",
  "mvc",
  "mvp",
  "mvvm",
]);

/**
 * Crawl Baidu image search.
 *
 * Opens `url`, then for every entry in `words` types the keyword into the
 * `#kw` search box, submits it, collects the `.main_img` elements of the
 * result page and saves each one as `<rootPath>/<keyword>/<index>.jpg`.
 *
 * Depends on helpers that are not defined in this snippet:
 * `sleep(ms)`, `mkdir(dir)` and `downloadImg(src, file)`.
 */
(async () => {
  /** Replace the content of the search box with `keyword` and submit it. */
  const search = async (page, keyword) => {
    // Put the focus on the search input.
    await page.waitForSelector("#kw");
    await page.focus("#kw");
    // Clear the previous input: Ctrl + A, then Backspace.
    await page.keyboard.down("Control");
    await page.keyboard.press("A");
    await sleep(100 + Math.floor(Math.random() * 200));
    await page.keyboard.up("Control");
    await page.keyboard.press("Backspace");
    // Type the keyword.
    await page.keyboard.type(keyword);
    // Register the navigation wait BEFORE pressing Enter; if the wait is
    // started afterwards a fast navigation can finish first and the wait
    // then runs into its timeout.
    await Promise.all([page.waitForNavigation(), page.keyboard.press("Enter")]);
  };

  /** Download every result image of `keyword` that is not on disk yet. */
  const saveImages = async (keyword, images) => {
    for (const [position, image] of images.entries()) {
      // File names are 1-based and tied to the position in the result list.
      const file = path.join(rootPath, `${keyword}/${position + 1}.jpg`);
      if (fs.existsSync(file)) continue;
      const src = await (await image.getProperty("src")).jsonValue();
      // An element without a source has nothing to download.
      if (!src) continue;
      await downloadImg(src, file);
      await sleep(100 + Math.floor(Math.random() * 100));
    }
  };

  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: { width: 1920, height: 1080 },
    // Optional: route traffic through a local proxy.
    // args: ["--proxy-server=127.0.0.1:10809"],
  });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    console.log(green("\r\n前往页面"), blue(url));

    // Progress bar: one tick per keyword.
    let current = 0;
    const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
    bar.start(words.length, current);
    try {
      for (const keyword of words) {
        console.log(green("\r\n搜索关键词"), blue(keyword));
        await search(page, keyword);

        // Collect the result images.
        const images = await page.$$(".main_img");
        console.log(images.length);
        // Each colour helper formats a single value, so the keyword is
        // passed to console.log as its own argument.
        if (images.length === 0) console.log(red("失败"), blue(keyword));

        // Create the target directory for this keyword.
        await mkdir(path.join(rootPath, `${keyword}`));
        await sleep(100 + Math.floor(Math.random() * 100));
        await saveImages(keyword, images);

        // Random pause before the next search.
        await sleep(100 + Math.floor(Math.random() * 1000));
        bar.update(++current);
      }
    } finally {
      bar.stop();
    }
  } finally {
    // Always shut the browser down, even when a step above throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

# 爬取古诗词文

TIP

下面这段代码爬取 so.gushiwen.cn 网站上指定诗人的第一页诗词的内容，并写入到 poems.json

const fs = require("fs");
const path = require("path");
const http = require("http");
const https = require("https");
const cliProgress = require("cli-progress");
const { red, green, blue, cyan, gray } = require("kolorist");
const puppeteer = require("puppeteer");
const { parse } = require("node-html-parser");

// Directory (relative to this script) where poems.json is written.
const rootPath = path.resolve(__dirname, "../gushicis/");

// Poem listing page; the poet is appended as the `astr` query parameter.
const baseUrl = "https://so.gushiwen.cn/shiwens/default.aspx";

// Poets to crawl, passed through `shuffle` so each run visits them in the
// order that helper returns.
let poets = shuffle([
  "李白",
  "杜甫",
  "辛弃疾",
  "白居易",
  "苏轼",
  "王维",
  "杜牧",
  "陆游",
]);

/**
 * Crawl the first result page of so.gushiwen.cn for every poet in `poets`
 * and write all poems found to `<rootPath>/poems.json`.
 *
 * Each saved entry has the shape `{ title, author, content, array }`, where
 * `array` is `content` split on the full stop "。".
 *
 * Depends on `sleep(ms)`, which is not defined in this snippet.
 */
(async () => {
  /**
   * Turn the outer HTML of one `.cont` element into a poem record.
   * Returns `null` when the fragment has no poem body or no title, so
   * non-poem containers are skipped instead of throwing.
   */
  const toPoem = (html, author) => {
    const node = parse(html);
    const body = node.querySelector(".contson");
    const heading = node.querySelector("p a b");
    if (!body || !heading) return null;

    const title = heading.textContent;
    const content = body.textContent
      // Drop parenthesised notes. The match is lazy so that two notes on
      // the same line do not swallow the verse text between them.
      .replace(/\(.*?\)/g, "")
      // Join the lines and drop the indentation that follows each break.
      .replace(/\n\s*/g, "");
    const array = content.split(/。/).filter((item) => item.trim() !== "");
    return { title, author, content, array };
  };

  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: { width: 1920, height: 1080 },
    // Optional: route traffic through a local proxy.
    // args: ["--proxy-server=127.0.0.1:10809"],
  });
  try {
    const page = await browser.newPage();

    // Progress bar: one tick per poet.
    let current = 0;
    const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
    bar.start(poets.length, current);

    const result = [];
    try {
      for (const poet of poets) {
        const url = `${baseUrl}?astr=${poet}`;
        console.log(green("\r\n前往页面"), blue(url));
        await page.goto(url);
        await page.waitForSelector(".sons > .cont .contson");
        await sleep(100 + Math.floor(Math.random() * 100));

        // Serialise the candidate containers inside the page; they are
        // parsed on the Node side with node-html-parser.
        const poems = await page.evaluate(() =>
          Array.from(document.querySelectorAll(".sons .cont"), (el) => el.outerHTML)
        );

        // Report the poet that produced no results (not the whole list).
        if (poems.length === 0) console.log(red("失败"), blue(poet));

        for (const html of poems) {
          const poem = toPoem(html, poet);
          if (poem) result.push(poem);
        }
        bar.update(++current);
      }
    } finally {
      bar.stop();
    }

    // Make sure the output directory exists before writing into it.
    fs.mkdirSync(rootPath, { recursive: true });
    fs.writeFileSync(path.join(rootPath, `poems.json`), JSON.stringify(result, null, 2));
  } finally {
    // Always shut the browser down, even when a step above throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

← 自动化任务

爬取百度搜图

爬取古诗词文