NodeJS 网页爬虫一小时实战

freeCodeCamp 成都社区 在线工作坊 #2

2019 年 4 月 14 日(周日)晚 8 ~ 10 点

QQ 群 (466129470) 上课(入群注明课程名)

学习收获

一小时内学会用 Node.JS 从多个网站汇总最新本地 IT 活动列表,并了解一些知识点 ——

  1. 用 Chrome 调试器分析 HTML 结构
  2. 用 Chrome 调试器分析 HTTP 接口
  3. Puppeteer 无界面浏览器操作
  4. JavaScript 最新标准语法

内容大纲

  • JavaScript 标准项目生成
  • 静态网页抓取
  • 动态网页抓取
  • 数据接口分析
  • 数据分页处理

课前准备

请学员务必提前执行以下命令,安装好开发环境!(操作图解)

Windows

choco install -y git tortoisegit nodejs-lts vscode googlechrome zoom

Mac OS X

brew install node cask
brew cask install sourcetree visual-studio-code google-chrome zoomus

操作要点

JavaScript 标准项目生成

npm init es-pack ~/Desktop/web-crawler

code ~/Desktop/web-crawler

为 Node.JS 定制配置

npm uninstall amd-bundle

npm install -D \
    @babel/cli \
    @babel/core \
    @babel/plugin-transform-runtime

npm install @babel/runtime

package.json

{
  "engines": {
    "node": "^6.13.0"
  },
  "scripts": {
    "lint": "eslint source/ --fix",
    "pack": "babel source/ -d dist/ -s"
  },
  "babel": {
    "presets": [
      [
        "@babel/preset-env",
        {
          "targets": {
            "node": "6.13.0"
          }
        }
      ]
    ],
    "plugins": ["@babel/plugin-transform-runtime"]
  }
}

静态网页抓取

安装依赖包

npm install jsdom

核心代码

source/static.js

#! /usr/bin/env node

import "@babel/polyfill";

import { JSDOM } from "jsdom";

/**
 * Loads the SegmentFault events page (filtered by `city=510100`) with JSDOM
 * and prints one `{ title, date, address, banner }` record per event card.
 */
(async () => {
  const dom = await JSDOM.fromURL(
    "https://segmentfault.com/events?city=510100"
  );
  const { document } = dom.window;

  // Trimmed text of the first element under `root` that matches `selector`.
  const textOf = (root, selector) =>
    root.querySelector(selector).textContent.trim();

  const cards = document.querySelectorAll(".all-event-list .widget-event");

  const list = Array.from(cards, card => ({
    title: textOf(card, ".title"),
    // The first three characters of each meta entry are dropped
    // (presumably a label prefix — confirm against the live markup).
    date: textOf(card, ".widget-event__meta :first-child").slice(3),
    address: textOf(card, ".widget-event__meta :last-child").slice(3),
    // The banner URL is read from the `data-original` attribute.
    banner: card.querySelector(".widget-event__banner").dataset.original
  }));

  console.info(list);
})();

编译并运行

npm run build

node dist/static

动态网页抓取

安装依赖包

npm install puppeteer-core @tech_query/node-toolkit

npm install fs-match -D

增加项目配置

package.json

{
  "scripts": {
    "install": "app-find chrome -c"
  }
}

首次安装需手动应用配置:

npm run install

核心代码

source/dynamic.js

#! /usr/bin/env node

import "@babel/polyfill";

import Puppeteer from "puppeteer-core";

import { getNPMConfig } from "@tech_query/node-toolkit";

/**
 * Drives a locally installed Chrome through Puppeteer to scrape the Juejin
 * Chengdu events page, then prints one `{ title, date, address, banner }`
 * record per event card.
 *
 * The browser is always closed, even when scraping fails, so no Chrome
 * process is left behind and Node can exit on its own. Failures are logged
 * and reported through a non-zero exit code.
 */
(async () => {
  const browser = await Puppeteer.launch({
    executablePath: getNPMConfig("chrome")
  });

  try {
    const [page] = await browser.pages();

    await page.goto("https://juejin.im/events/chengdu");

    // `waitForSelector` is the dedicated API; the generic `page.waitFor`
    // was deprecated and removed in later Puppeteer releases.
    await page.waitForSelector(".events-list .events-inner");

    // NOTE: this callback is serialized and executed inside the page,
    // so it must not reference anything from the Node.js scope.
    const list = await page.$$eval(".events-list .events-inner", items =>
      items.map(item => ({
        title: item.querySelector(".title").textContent.trim(),
        date: item.querySelector(".date").textContent.trim(),
        address: item.querySelector(".address").textContent.trim(),
        // Lazy `(.+?)` keeps the optional closing quote out of the capture:
        // a greedy `(.+)` turns `url("x.png")` into `x.png"`.
        // Falls back to `undefined` when no background image is set.
        banner: (item
          .querySelector(".banner")
          .style.backgroundImage.match(/url\(["']?(.+?)["']?\)/) || [])[1]
      }))
    );

    console.info(list);
  } finally {
    // Closing the browser releases the child process that would otherwise
    // keep the event loop alive, so no explicit `process.exit()` is needed.
    await browser.close();
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});

编译并运行

npm run build

node dist/dynamic

数据接口分析

安装依赖包

npm install node-fetch

核心代码

source/data.js

#! /usr/bin/env node

import "@babel/polyfill";

import { URLSearchParams } from "url";

import fetch from "node-fetch";

/**
 * Requests the Juejin event list API for Chengdu (ordered by start time)
 * and prints the `d` field of the JSON response.
 */
(async () => {
  const query = new URLSearchParams({
    src: "web",
    orderType: "startTime",
    cityAlias: "chengdu"
  });

  const response = await fetch(
    `https://event-storage-api-ms.juejin.im/v2/getEventList?${query}`
  );

  const payload = await response.json();

  // The event records live under the `d` key of the response body.
  console.info(payload.d);
})();

编译并运行

npm run build

node dist/data

数据分页处理

传统思路

source/data.js

import { URLSearchParams } from "url";

import fetch from "node-fetch";

/**
 * Fetches a single page of the Juejin Chengdu event list.
 *
 * The request URL is written to stderr via `console.warn`, which keeps
 * stdout free for the data itself.
 *
 * @param {number} [index=1] - Page number sent as the `pageNum` query parameter.
 * @returns {Promise<*>} The `d` field of the JSON response body.
 */
export default async function (index = 1) {
  const query = new URLSearchParams({
    src: "web",
    orderType: "startTime",
    cityAlias: "chengdu",
    pageNum: index
  });
  const address = `https://event-storage-api-ms.juejin.im/v2/getEventList?${query}`;

  console.warn(address);

  const response = await fetch(address);
  const body = await response.json();

  return body.d;
}

source/index.js

#! /usr/bin/env node

import "@babel/polyfill";

import crawler from "./data";

/**
 * Collects every page returned by `crawler`, starting at page 1, and prints
 * the merged list as JSON indented by 4 spaces.
 *
 * Paging stops at the first page whose first item is missing or falsy, or at
 * the first error (which is logged to stderr); whatever was collected up to
 * that point is still printed.
 */
(async () => {
  const list = [];

  let pageNumber = 0;
  let finished = false;

  while (!finished) {
    pageNumber += 1;

    try {
      const items = await crawler(pageNumber);

      // `(items || "")[0]` tolerates a null/undefined page without throwing.
      if ((items || "")[0]) {
        list.push(...items);
      } else {
        finished = true;
      }
    } catch (error) {
      console.warn(error);
      finished = true;
    }
  }

  console.info(JSON.stringify(list, null, 4));
})();

通过命令输出数据到文件 ——

npm run build

node dist/ 1> index.json

现代思路

source/data.js

// NOTE: this snippet omits the module's imports; `URLSearchParams` (from "url")
// and `fetch` (from "node-fetch") must still be imported at the top of the
// file, as in the previous version of source/data.js.

/**
 * Async generator over the pages of the Juejin Chengdu event list.
 *
 * Requests consecutive pages and yields `{ URL, data }` for each one, where
 * `URL` is the requested address and `data` is the `d` field of the JSON
 * response. Iteration ends at the first page whose first item is missing
 * or falsy.
 *
 * @param {number} [start=1] - Page number of the first request.
 */
export default async function* (start = 1) {
  for (let pageNum = start; ; pageNum++) {
    const query = new URLSearchParams({
      src: "web",
      orderType: "startTime",
      cityAlias: "chengdu",
      pageNum
    });
    const URL = `https://event-storage-api-ms.juejin.im/v2/getEventList?${query}`;

    const response = await fetch(URL);
    const data = (await response.json()).d;

    // `(data || "")[0]` tolerates a null/undefined page without throwing.
    if (!(data || "")[0]) return;

    yield { URL, data };
  }
}

source/index.js

#! /usr/bin/env node

import "@babel/polyfill";

import crawler from "./data";

/**
 * Drains the `crawler` async generator, logging each requested URL to stderr
 * and merging every page's records, then prints the merged list as JSON
 * indented by 4 spaces.
 */
(async () => {
  const list = [];

  for await (const chunk of crawler()) {
    console.warn(chunk.URL);

    list.push(...chunk.data);
  }

  const output = JSON.stringify(list, null, 4);

  console.info(output);
})();

Think more…

  1. 多源数据去重
  2. 定期抓取
  3. 展示界面
  4. 本地应用打包
  5. 服务器部署

【附】样本数据

  1. https://www.huodongxing.com/events?orderby=n&tag=IT%E4%BA%92%E8%81%94%E7%BD%91&city=%E6%88%90%E9%83%BD

  2. https://www.bagevent.com/eventlist.html?f=1&tag=17&city=%E6%88%90%E9%83%BD

  3. https://www.oschina.net/event?tab=latest&city=%E6%88%90%E9%83%BD&time=all

  4. https://juejin.im/events/chengdu

  5. https://segmentfault.com/events?city=510100

【附】参考文档


评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×