freeCodeCamp 成都社区 在线工作坊 #2
2019 年 4 月 14 日(周日)晚 8 ~ 10 点
加 QQ 群 (466129470) 上课(入群注明课程名)
学习收获
一小时内学会用 Node.JS 从多个网站汇总最新本地 IT 活动列表,并了解一些知识点 ——
- 用 Chrome 调试器分析 HTML 结构
- 用 Chrome 调试器分析 HTTP 接口
- Puppeteer 无界面浏览器操作
- JavaScript 最新标准语法
内容大纲
- JavaScript 标准项目生成
- 静态网页抓取
- 动态网页抓取
- 数据接口分析
- 数据分页处理
课前准备
请学员务必提前执行以下命令,安装好开发环境!(操作图解)
Windows
choco install -y git tortoisegit nodejs-lts vscode googlechrome zoom
Mac OS X
brew install node cask
brew cask install sourcetree visual-studio-code google-chrome zoomus
操作要点
JavaScript 标准项目生成
npm init es-pack ~/Desktop/web-crawler
code ~/Desktop/web-crawler
为 Node.JS 定制配置
npm uninstall amd-bundle
npm install -D \
@babel/cli \
@babel/core \
@babel/plugin-transform-runtime
npm install @babel/runtime
package.json
{
"engines": {
"node": ">=6.13.0"
},
"scripts": {
"lint": "eslint source/ --fix",
"pack": "babel source/ -d dist/ -s"
},
"babel": {
"presets": [
[
"@babel/preset-env",
{
"targets": {
"node": "6.13.0"
}
}
]
],
"plugins": ["@babel/plugin-transform-runtime"]
}
}
静态网页抓取
安装依赖包
npm install jsdom
核心代码
source/static.js
#! /usr/bin/env node
import "@babel/polyfill";
import { JSDOM } from "jsdom";
// Download the SegmentFault event page for city 510100, parse it with jsdom
// and print one plain record per event card.
(async () => {
  const dom = await JSDOM.fromURL("https://segmentfault.com/events?city=510100");
  const { document } = dom.window;

  // Trimmed text content of the first descendant of `card` matching `selector`.
  const textOf = (card, selector) =>
    card.querySelector(selector).textContent.trim();

  const cards = Array.from(
    document.querySelectorAll(".all-event-list .widget-event")
  );

  const list = cards.map(card => {
    const title = textOf(card, ".title");
    // slice(3) drops the first three characters of each meta line
    // (presumably a fixed label prefix — confirm against the live page).
    const date = textOf(card, ".widget-event__meta :first-child").slice(3);
    const address = textOf(card, ".widget-event__meta :last-child").slice(3);
    const banner = card.querySelector(".widget-event__banner").dataset.original;

    return { title, date, address, banner };
  });

  console.info(list);
})();
编译并运行
npm run build
node dist/static
动态网页抓取
安装依赖包
npm install puppeteer-core @tech_query/node-toolkit
npm install fs-match -D
增加项目配置
package.json
{
"scripts": {
"install": "app-find chrome -c"
}
}
首次安装需手动应用配置:
npm run install
核心代码
source/dynamic.js
#! /usr/bin/env node
import "@babel/polyfill";
import Puppeteer from "puppeteer-core";
import { getNPMConfig } from "@tech_query/node-toolkit";
// Launch the locally installed Chrome through puppeteer-core, wait for the
// Juejin Chengdu event list to be rendered, then extract and print one record
// per event card.
(async () => {
  const browser = await Puppeteer.launch({
    executablePath: getNPMConfig("chrome")
  });

  try {
    const [page] = await browser.pages();
    await page.goto("https://juejin.im/events/chengdu");
    // Wait until at least one event card exists in the DOM before querying it.
    await page.waitForSelector(".events-list .events-inner");

    // NOTE: this callback is serialized and evaluated inside the page, so it
    // must not reference anything from the surrounding Node.js scope.
    const list = await page.$$eval(".events-list .events-inner", cards =>
      cards.map(item => ({
        title: item.querySelector(".title").textContent.trim(),
        date: item.querySelector(".date").textContent.trim(),
        address: item.querySelector(".address").textContent.trim(),
        // Capture the opening quote (if any) and require the same quote before
        // the closing parenthesis, so the quote never leaks into the URL.
        banner: (item
          .querySelector(".banner")
          .style.backgroundImage.match(/url\((['"]?)(.+?)\1\)/) || [])[2]
      }))
    );

    console.info(list);
  } finally {
    // Closing the browser releases the Chrome process and lets Node.js exit
    // on its own, on success and on failure alike.
    await browser.close();
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
编译并运行
npm run build
node dist/dynamic
数据接口分析
安装依赖包
npm install node-fetch
核心代码
source/data.js
#! /usr/bin/env node
import "@babel/polyfill";
import { URLSearchParams } from "url";
import fetch from "node-fetch";
// Request the Juejin event-list API for Chengdu and print the `d` field of
// the JSON response.
(async () => {
  const query = new URLSearchParams({
    src: "web",
    orderType: "startTime",
    cityAlias: "chengdu"
  });

  const response = await fetch(
    `https://event-storage-api-ms.juejin.im/v2/getEventList?${query}`
  );
  const payload = await response.json();

  console.info(payload.d);
})();
编译并运行
npm run build
node dist/data
数据分页处理
传统思路
source/data.js
import { URLSearchParams } from "url";
import fetch from "node-fetch";
/**
 * Fetch one page of Chengdu events from the Juejin event-list API.
 *
 * @param {number} [index=1] - Value sent as the `pageNum` query parameter.
 * @returns {Promise<*>} The `d` field of the parsed JSON response.
 */
export default async function (index = 1) {
  const query = new URLSearchParams({
    src: "web",
    orderType: "startTime",
    cityAlias: "chengdu",
    pageNum: index
  });
  const address = `https://event-storage-api-ms.juejin.im/v2/getEventList?${query}`;

  // Progress goes through console.warn; the caller prints the data itself.
  console.warn(address);

  const response = await fetch(address);
  const payload = await response.json();

  return payload.d;
}
source/index.js
#! /usr/bin/env node
import "@babel/polyfill";
import crawler from "./data";
// Request page 1, 2, 3, ... from the crawler until a page comes back without
// a truthy first element or a request fails, then print everything collected
// as indented JSON.
(async () => {
  const list = [];
  let pageNumber = 1;

  while (true) {
    try {
      const page = await crawler(pageNumber);

      // `(page || "")[0]` is falsy for null/undefined and for an empty list.
      if (!(page || "")[0]) break;

      list.push(...page);
    } catch (error) {
      // Report the failure and keep whatever was collected so far.
      console.warn(error);
      break;
    }
    pageNumber += 1;
  }

  console.info(JSON.stringify(list, null, 4));
})();
通过命令输出数据到文件 ——
npm run build
node dist/ 1> index.json
现代思路
source/data.js(文件顶部的两行 import 与“传统思路”中的 source/data.js 相同,此处省略)
/**
 * Asynchronously iterate over the pages of the Juejin Chengdu event list.
 *
 * Pages are requested one at a time, starting at `start`. Iteration ends at
 * the first page whose `d` field has no truthy first element.
 *
 * @param {number} [start=1] - First value sent as the `pageNum` query parameter.
 * @yields {{URL: string, data: *}} The requested address and that page's `d` field.
 */
export default async function* (start = 1) {
  let nextPage = start;

  for (;;) {
    const query = new URLSearchParams({
      src: "web",
      orderType: "startTime",
      cityAlias: "chengdu",
      pageNum: nextPage++
    });
    const address = `https://event-storage-api-ms.juejin.im/v2/getEventList?${query}`;

    const response = await fetch(address);
    const data = (await response.json()).d;

    // A missing or empty page marks the end of the listing.
    if (!(data || "")[0]) return;

    yield { URL: address, data };
  }
}
source/index.js
#! /usr/bin/env node
import "@babel/polyfill";
import crawler from "./data";
// Consume the crawler's async iterator: log each page's address, merge the
// page's items into one list, then print that list as indented JSON.
(async () => {
  const list = [];

  for await (const batch of crawler()) {
    console.warn(batch.URL);
    list.push(...batch.data);
  }

  console.info(JSON.stringify(list, null, 4));
})();
Think more…
- 多源数据去重
- 定期抓取
- 展示界面
- 本地应用打包
- 服务器部署
【附】样本数据
https://www.bagevent.com/eventlist.html?f=1&tag=17&city=%E6%88%90%E9%83%BD
https://www.oschina.net/event?tab=latest&city=%E6%88%90%E9%83%BD&time=all