Commit c804c41e authored by coolfish's avatar coolfish

Initial commit

parents
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Typescript v1 declaration files
typings/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
doc
example
MIT License
Copyright (c) 2017 coolfish
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
# 页面爬虫解析器核心
此工具适用于
1. 对单独页面链接进行解析
2. 配合队列进行多页面解析
## 解释说明
支持详情页下一页抓取,支持繁体转换,支持对字数统计,支持对图片数量统计。
目前主要针对静态页的解析,对json请求和jsonp请求的解析做了预留(暂不支持)。
[![NPM](https://nodei.co/npm/almighty-parser-core.png?downloads=true&downloadRank=true&stars=true)](https://nodei.co/npm/almighty-parser-core/)
[![npm](https://img.shields.io/npm/v/almighty-parser-core.svg)]()
[![npm](https://img.shields.io/npm/dm/almighty-parser-core.svg)]()
[![license](https://img.shields.io/github/license/coolfishstudio/almighty-parser-core.svg)]()
## 安装
```
npm i --save almighty-parser-core
```
## api接口
- [x] `getLinks` 获取待抓页链接
- [x] `getContent` 获取详情页内容
- [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合]
- [x] `isArticleUrl` 检测链接是否是详情页
- [x] `isListUrl` 检测链接是否是列表页
- [x] `getIdFromArticleUrl` 获取页面链接的唯一标识
## 配置参数
[文档说明](https://github.com/coolfishstudio/almighty-parser-core/blob/master/doc/CONFIG.md)
## 实例
### 解析器案例
[糗事百科 - 基础](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-qiushibaike.js)
[今日健康 - 繁体](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-healthno1.js)
[爆料网 - 详情下一页](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-baoliao5.js)
### 定义网站规则
```
module.exports = {
// 域名 网站域名,设置域名后只处理这些域名下的网页
domains: 'https://www.qiushibaike.com/',
// 列表页url的正则,符合这些正则的页面会被当作列表页处理
listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
// 内容页url的正则,符合这些正则的页面会被当作内容页处理
contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
// 从内容页中抽取需要的数据
fields: [{
// 作者
name: 'author',
meta: {
selector: ['.author h2'],
format: 'text'
}
}, {
// 标签
name: 'tags',
meta: {
format: 'text',
selector: ['.source a'],
index: 0
}
}, {
// 网页关键字
name: 'keywords',
meta: {
format: 'meta',
selector: ['meta[name="keywords"]']
}
}, {
// 网页描述
name: 'description',
meta: {
format: 'meta',
selector: ['meta[name="description"]']
}
}, {
// 详情
name: 'content',
meta: {
selector: ['.content', '.thumb'],
format: 'html'
},
required: true
}, {
name: 'imagesCount',
meta: {
selector: ['.thumb'],
format: 'count',
countType: 'image'
},
defaultValue: 0
}, {
name: 'wordsCount',
meta: {
selector: ['.content'],
format: 'count',
countType: 'text'
},
defaultValue: 0
}, {
name: 'comments',
meta: {
selector: ['.stats-comments .number'],
format: 'text'
},
defaultValue: 0
}, {
name: 'likes',
meta: {
selector: ['.stats-vote .number'],
format: 'text'
},
defaultValue: 0
}],
// 是否模拟用户请求
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
// 编码 默认utf-8
charset: null,
// 回调函数 对所有数据做处理
afterExtractAll: function (data) {
data.fields['hits'] = 0;
return data;
},
afterExtractField: function (fieldsName, data) {
if (fieldsName === 'tags') {
data = data ? data.split(',') : [];
}
if (fieldsName === 'comments') {
data = +data;
}
if (fieldsName === 'likes') {
data = +data;
}
return data;
}
};
```
### 引入
```
const Crawler = require('almighty-parser-core')
const options = require('../parser/parser-qiushibaike.js')
const parser = new Crawler(options)
```
### API测试
[测试案例](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/test/qiushibaike.js)
#### parse
```
{ fields:
{ author: '草莓、牛奶巧克力',
tags: [ '搞笑图片' ],
keywords: '',
description: '笑死我了',
content: '<div class="content">\n\n笑死我了\n\n</div><div class="thumb">\n\n<img src="//pic.qiushibaike.com/system/pictures/11909/119095438/medium/app119095438.jpg" alt="糗事#119095438">\n\n</div>',
imagesCount: 1,
wordsCount: 4,
comments: 0,
likes: 457,
from: 'https://www.qiushibaike.com/article/119095438',
sourceId: 'com.qiushibaike.www-article-119095438',
site: 'www.qiushibaike.com',
hits: 0 },
urls:
[ 'https://www.qiushibaike.com/',
'https://www.qiushibaike.com/hot/',
'https://www.qiushibaike.com/imgrank/',
'https://www.qiushibaike.com/text/',
'https://www.qiushibaike.com/history/',
'https://www.qiushibaike.com/pic/',
'https://www.qiushibaike.com/textnew/',
'https://www.qiushibaike.com/my',
'https://www.qiushibaike.com/article/116423562',
'https://www.qiushibaike.com/article/116424718',
'https://www.qiushibaike.com/article/116421669',
'https://www.qiushibaike.com/article/116423344',
'https://www.qiushibaike.com/article/116426229',
'https://www.qiushibaike.com/article/116423107',
'https://www.qiushibaike.com/article/104614784',
'https://www.qiushibaike.com/article/104590828',
'https://www.qiushibaike.com/article/104629666',
'https://www.qiushibaike.com/article/104599846',
'https://www.qiushibaike.com/article/104598154',
'https://www.qiushibaike.com/article/104619022',
'https://www.qiushibaike.com/article/118954381',
'https://www.qiushibaike.com/article/118491926',
'https://www.qiushibaike.com/article/118563113',
'https://www.qiushibaike.com/article/118806836',
'https://www.qiushibaike.com/article/118525804',
'https://www.qiushibaike.com/article/118770803',
'https://www.qiushibaike.com/article/119008939',
'https://www.qiushibaike.com/article/119033005',
'https://www.qiushibaike.com/article/119036209',
'https://www.qiushibaike.com/article/118922421',
'https://www.qiushibaike.com/article/119014594',
'https://www.qiushibaike.com/article/119009873',
'https://www.qiushibaike.com/article/118934286',
'https://www.qiushibaike.com/joke/',
'https://www.qiushibaike.com/article/' ] }
```
其余接口测试请下载后运行
```
npm run test:qiushibaike
```
## License
[MIT License](https://opensource.org/licenses/MIT)
## API 文档
- [x] `getLinks` 获取待抓页链接
- [x] `getContent` 获取详情页内容
- [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合]
- [x] `isArticleUrl` 检测链接是否是详情页
- [x] `isListUrl` 检测链接是否是列表页
- [x] `getIdFromArticleUrl` 获取页面链接的唯一标识
## 配置参数
针对不同的网站,需要定义各自的配置。
注意 目前只支持html静态页的内容抓取
配置 | 描述 | 是否必填 | 类型
------------- | ------------- | ------------- | -------------
domains | 网站域名 | 必填 | 字符串
listUrlRegexes | 列表页url的正则,符合这些正则的页面会被当作列表页处理 | 必填 | 数组
contentUrlRegexes | 内容页url的正则,符合这些正则的页面会被当作内容页处理 | 必填 | 数组
fields | 从内容页中抽取需要的数据 | 必填 | fields示例
userAgent | 是否模拟用户请求 | 选填 | 字符串
charset | 编码 默认utf-8 | 选填 | 字符串
afterExtractField | 回调函数 对每一个抽取出来的数据进行处理 | 选填 | 方法
afterExtractAll | 回调函数 对所有抽取出来的数据进行处理 | 选填 | 方法
contentPage | 对详情页下一页内容处理 | 选填 | contentPage示例
## fields示例
字段 | 描述 | 类型
------------- | ------------- | -------------
name | 定义字段名字 | 字符串 必填
meta | 选择器 | meta示例 必填
defaultValue | 默认值 | 任意 选填
### meta示例
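字段 | 描述 | 类型
------------- | ------------- | -------------
selector | 选择器(支持多个拼接) | 数组 必填
format | 返回格式[text/html/meta/count 默认text] | 字符串 选填
countType | 统计类型[image/text],format 为 count 时使用 | 字符串 选填
index | 下标 | 数字 选填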
## contentPage示例
字段 | 描述 | 类型
------------- | ------------- | -------------
urls | 下一页的正则 | 数组 必填
selector | 选择器 | 数组 必填
appendNode | 插入的位置 | 任意 必填
\ No newline at end of file
## 实例
### 定义网站规则
```
module.exports = {
// 域名 网站域名,设置域名后只处理这些域名下的网页
domains: 'https://www.qiushibaike.com/',
// 列表页url的正则,符合这些正则的页面会被当作列表页处理
listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
// 内容页url的正则,符合这些正则的页面会被当作内容页处理
contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
// 从内容页中抽取需要的数据
fields: [{
// 作者
name: 'author',
meta: {
selector: ['.author h2'],
format: 'text'
}
}, {
// 标签
name: 'tags',
meta: {
format: 'text',
selector: ['.source a'],
index: 0
}
}, {
// 网页关键字
name: 'keywords',
meta: {
format: 'meta',
selector: ['meta[name="keywords"]']
}
}, {
// 网页描述
name: 'description',
meta: {
format: 'meta',
selector: ['meta[name="description"]']
}
}, {
// 详情
name: 'content',
meta: {
selector: ['.content', '.thumb'],
format: 'html'
},
required: true
}, {
name: 'imagesCount',
meta: {
selector: ['.thumb'],
format: 'count',
countType: 'image'
},
defaultValue: 0
}, {
name: 'wordsCount',
meta: {
selector: ['.content'],
format: 'count',
countType: 'text'
},
defaultValue: 0
}, {
name: 'comments',
meta: {
selector: ['.stats-comments .number'],
format: 'text'
},
defaultValue: 0
}, {
name: 'likes',
meta: {
selector: ['.stats-vote .number'],
format: 'text'
},
defaultValue: 0
}],
// 是否模拟用户请求
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
// 编码 默认utf-8
charset: null,
// 回调函数 对所有数据做处理
afterExtractAll: function (data) {
data.fields['hits'] = 0;
return data;
},
afterExtractField: function (fieldsName, data) {
if (fieldsName === 'tags') {
data = data ? data.split(',') : [];
}
if (fieldsName === 'comments') {
data = +data;
}
if (fieldsName === 'likes') {
data = +data;
}
return data;
}
};
```
### 引入
```
const Crawler = require('almighty-parser-core')
const options = require('../parser/parser-qiushibaike.js')
const parser = new Crawler(options)
```
### API测试
#### parse
```
{ fields:
{ author: '草莓、牛奶巧克力',
tags: [ '搞笑图片' ],
keywords: '',
description: '笑死我了',
content: '<div class="content">\n\n笑死我了\n\n</div><div class="thumb">\n\n<img src="//pic.qiushibaike.com/system/pictures/11909/119095438/medium/app119095438.jpg" alt="糗事#119095438">\n\n</div>',
imagesCount: 1,
wordsCount: 4,
comments: 0,
likes: 457,
from: 'https://www.qiushibaike.com/article/119095438',
sourceId: 'com.qiushibaike.www-article-119095438',
site: 'www.qiushibaike.com',
hits: 0 },
urls:
[ 'https://www.qiushibaike.com/',
'https://www.qiushibaike.com/hot/',
'https://www.qiushibaike.com/imgrank/',
'https://www.qiushibaike.com/text/',
'https://www.qiushibaike.com/history/',
'https://www.qiushibaike.com/pic/',
'https://www.qiushibaike.com/textnew/',
'https://www.qiushibaike.com/my',
'https://www.qiushibaike.com/article/116423562',
'https://www.qiushibaike.com/article/116424718',
'https://www.qiushibaike.com/article/116421669',
'https://www.qiushibaike.com/article/116423344',
'https://www.qiushibaike.com/article/116426229',
'https://www.qiushibaike.com/article/116423107',
'https://www.qiushibaike.com/article/104614784',
'https://www.qiushibaike.com/article/104590828',
'https://www.qiushibaike.com/article/104629666',
'https://www.qiushibaike.com/article/104599846',
'https://www.qiushibaike.com/article/104598154',
'https://www.qiushibaike.com/article/104619022',
'https://www.qiushibaike.com/article/118954381',
'https://www.qiushibaike.com/article/118491926',
'https://www.qiushibaike.com/article/118563113',
'https://www.qiushibaike.com/article/118806836',
'https://www.qiushibaike.com/article/118525804',
'https://www.qiushibaike.com/article/118770803',
'https://www.qiushibaike.com/article/119008939',
'https://www.qiushibaike.com/article/119033005',
'https://www.qiushibaike.com/article/119036209',
'https://www.qiushibaike.com/article/118922421',
'https://www.qiushibaike.com/article/119014594',
'https://www.qiushibaike.com/article/119009873',
'https://www.qiushibaike.com/article/118934286',
'https://www.qiushibaike.com/joke/',
'https://www.qiushibaike.com/article/' ] }
```
其余接口测试请下载后运行
```
npm run test:qiushibaike
```
'use strict';
/**
* 爆料网
* http://www.baoliao5.com/
*/
module.exports = {
// 域名 网站域名,设置域名后只处理这些域名下的网页
domains: 'http://www.baoliao5.com/',
// 列表页url的正则,符合这些正则的页面会被当作列表页处理
listUrlRegexes: [/http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)*$/, /http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)+\/list[0-9_]+\.html*$/],
// 内容页url的正则,符合这些正则的页面会被当作内容页处理
contentUrlRegexes: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+\.html/],
// 从内容页中抽取需要的数据
fields: [{
// 标题
name: 'title',
meta: {
// 默认 type 为 jquery/text/xpath
selector: ['.t4Btit'],
format: 'text'
},
required: true
}, {
// 详情
name: 'content',
meta: {
selector: ['#icontent'],
format: 'html'
},
required: true
}, {
// 作者
name: 'author',
meta: {
selector: ['.t4Bexp'],
format: 'text'
}
}, {
// 标签
name: 'tags',
meta: {
format: 'text',
selector: ['.itj_lt .lc a'],
index: 1
}
}, {
// 网页关键字
name: 'keywords',
meta: {
format: 'meta',
selector: ['meta[name="keywords"]']
}
}, {
// 网页描述
name: 'description',
meta: {
format: 'meta',
selector: ['meta[name="description"]']
}
}, {
name: 'imagesCount',
meta: {
selector: ['#icontent'],
format: 'count',
countType: 'image'
},
defaultValue: 0
}, {
name: 'wordsCount',
meta: {
selector: ['#icontent'],
format: 'count',
countType: 'text'
},
defaultValue: 0
}, {
name: 'publishedAt',
meta: {
format: 'text',
selector: ['.t4Bexp']
}
}],
// 内容下一页
contentPage: {
urls: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+_[0-9]+\.html/],
selector: ['#icontent'],
appendNode: '#icontent'
},
// 是否模拟用户请求
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
// 编码 默认utf-8
charset: 'gb2312',
// 回调函数 对所有数据做处理
afterExtractAll: function (data) {
data.fields['comments'] = 0;
data.fields['hits'] = 0;
data.fields['likes'] = 0;
return data;
},
afterExtractField: function (fieldsName, data) {
if (fieldsName === 'author') {
data = data.trim()
if (data.indexOf('编辑:') >= 0) {
var arr = data.split('编辑:');
data = arr[arr.length - 1];
} else {
data = '';
}
}
if (fieldsName === 'publishedAt') {
data = new Date(data.replace(/[^0-9\-\: ]+/img, '')).getTime() || new Date().getTime();
}
if (fieldsName === 'tags') {
data = (data !== '') ? [data] : [];
}
return data;
}
};
'use strict';
/**
* healthNo1
* http://www.healthno1.com/
*/
module.exports = {
// 域名 网站域名,设置域名后只处理这些域名下的网页
domains: 'http://www.healthno1.com/',
// 列表页url的正则,符合这些正则的页面会被当作列表页处理
listUrlRegexes: [/^http:\/\/www\.healthno1\.com(\/[a-z_]+(\.html)?)*(\/)?(\?start=[0-9]+)?$/],
// 内容页url的正则,符合这些正则的页面会被当作内容页处理
contentUrlRegexes: [/^http:\/\/www\.healthno1\.com\/([a-z_]+\/)*[0-9-]+\.html$/],
// 从内容页中抽取需要的数据
fields: [{
// 标题
name: 'title',
meta: {
// 默认 type 为 jquery/text/xpath
selector: ['#gkContentWrap .item-page header h1'],
format: 'text'
},
required: true
}, {
// 详情
name: 'content',
meta: {
selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
format: 'html'
},
required: true
}, {
// 作者
name: 'author',
meta: {
format: 'meta',
selector: ['meta[name="author"]']
}
}, {
// 标签
name: 'tags',
meta: {
format: 'text',
selector: ['.category-name a'],
index: 0
}
}, {
// 网页关键字
name: 'keywords',
meta: {
format: 'meta',
selector: ['meta[name="keywords"]']
}
}, {
// 网页描述
name: 'description',
meta: {
format: 'meta',
selector: ['meta[name="description"]']
}
}, {
name: 'imagesCount',
meta: {
selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
format: 'count',
countType: 'image'
},
defaultValue: 0
}, {
name: 'wordsCount',
meta: {
selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
format: 'count',
countType: 'text'
},
defaultValue: 0
}, {
name: 'publishedAt',
meta: {
format: 'text',
selector: ['.created time']
}
}, {
name: 'hits',
meta: {
format: 'text',
selector: ['.hits'],
index: 0
},
defaultValue: 0
}],
// 是否模拟用户请求
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
// 编码 默认utf-8
charset: null,
// 语言格式
i18n: 'tw2s',
// 回调函数 对所有数据做处理
afterExtractAll: function (data) {
data.fields['comments'] = 0;
data.fields['likes'] = 0;
return data;
},
afterExtractField: function (fieldsName, data) {
if (fieldsName === 'publishedAt') {
data = new Date(data.replace(/[^0-9\- \:]+/img, '')).getTime() || new Date().getTime();
}
if (fieldsName === 'tags') {
data = (data !== '') ? [data] : [];
}
if (fieldsName === 'title') {
data = data.trim();
}
if (fieldsName === 'hits') {
data = data.replace(/[^0-9]+/img, '') || 0;
}
return data;
}
};
'use strict';
/**
* 糗事百科
* https://www.qiushibaike.com/
*/
module.exports = {
// 域名 网站域名,设置域名后只处理这些域名下的网页
domains: 'https://www.qiushibaike.com/',
// 列表页url的正则,符合这些正则的页面会被当作列表页处理
listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
// 内容页url的正则,符合这些正则的页面会被当作内容页处理
contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
// 从内容页中抽取需要的数据
fields: [{
// 作者
name: 'author',
meta: {
selector: ['.author h2'],
format: 'text'
}
}, {
// 标签
name: 'tags',
meta: {
format: 'text',
selector: ['.source a'],
index: 0
}
}, {
// 网页关键字
name: 'keywords',
meta: {
format: 'meta',
selector: ['meta[name="keywords"]']
}
}, {
// 网页描述
name: 'description',
meta: {
format: 'meta',
selector: ['meta[name="description"]']
}
}, {
// 详情
name: 'content',
meta: {
selector: ['.content', '.thumb'],
format: 'html'
},
required: true
}, {
name: 'imagesCount',
meta: {
selector: ['.thumb'],
format: 'count',
countType: 'image'
},
defaultValue: 0
}, {
name: 'wordsCount',
meta: {
selector: ['.content'],
format: 'count',
countType: 'text'
},
defaultValue: 0
}, {
name: 'comments',
meta: {
selector: ['.stats-comments .number'],
format: 'text'
},
defaultValue: 0
}, {
name: 'likes',
meta: {
selector: ['.stats-vote .number'],
format: 'text'
},
defaultValue: 0
}],
// 是否模拟用户请求
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
// 编码 默认utf-8
charset: null,
// 回调函数 对所有数据做处理
afterExtractAll: function (data) {
data.fields['hits'] = 0;
return data;
},
afterExtractField: function (fieldsName, data) {
if (fieldsName === 'tags') {
data = data ? data.split(',') : [];
}
if (fieldsName === 'comments') {
data = +data;
}
if (fieldsName === 'likes') {
data = +data;
}
return data;
}
};
'use strict';
const Crawler = require('../../index.js')
const options = require('../parser/parser-baoliao5.js')
const parser = new Crawler(options)
// const url = 'http://www.baoliao5.com/'
const url = 'http://www.baoliao5.com/yule/201701/1867.html'
// const url = 'http://www.baoliao5.com/yingshi/201701/1835.html'
// const url = 'http://www.baoliao5.com/yingshi/'
// const url = 'http://www.baoliao5.com/yingshi/list_7_11.html'
// const url = 'http://www.baoliao5.com/meitu/201701/1848.html'
// const url = 'http://www.baoliao5.com/yule/neidi/'
// Names of the checks that threw or rejected during this run.
const errorItems = []

/**
 * Runs one API check, prints its result, and records the check name in
 * `errorItems` when the call throws or rejects.
 *
 * @param {string} name - Label stored in `errorItems` on failure.
 * @param {Function} invoke - Calls the parser API under test; may return a value or a promise.
 * @returns {Promise} Settles once the check has been reported; never rejects.
 */
async function runCheck (name, invoke) {
  try {
    const result = await invoke()
    console.log('获取数据内容为', result)
  } catch (e) {
    // A rejection value is not guaranteed to be an Error, so fall back to the raw value.
    console.error('[抓取数据出错]', e && e.message ? e.message : e)
    errorItems.push(name)
  }
}

// parse(): extracted fields plus discovered links
const testParseDate = () => runCheck('testParseDate', () => parser.parse(url))
// isArticleUrl(): is the url a content page?
const testIsArticleUrl = () => runCheck('testIsArticleUrl', () => parser.isArticleUrl(url))
// getIdFromArticleUrl(): unique id derived from the url
const testGetIdFromArticleUrl = () => runCheck('testGetIdFromArticleUrl', () => parser.getIdFromArticleUrl(url))
// getContent(): extracted fields only
const testGetContent = () => runCheck('testGetContent', () => parser.getContent(url))
// getLinks(): discovered links only
const testGetLinks = () => runCheck('testGetLinks', () => parser.getLinks(url))

// Steps in execution order. A step with `enabled: false` still prints its
// banner lines but does not call the API.
const steps = [
  { title: '测试步骤1 获取内容', run: testParseDate },
  { title: '测试步骤2 校验链接是否为详情页', run: testIsArticleUrl },
  { title: '测试步骤3 获取页面链接的唯一标示', run: testGetIdFromArticleUrl },
  { title: '测试步骤4 获取详情页内容', run: testGetContent, enabled: false },
  { title: '测试步骤5 获取列表页内容', run: testGetLinks }
]

/**
 * Script entry point: runs every step in order and prints a summary.
 */
async function start () {
  console.log('测试开始')
  console.log('------')
  for (const step of steps) {
    console.log(step.title)
    if (step.enabled !== false) {
      await step.run()
    }
    console.log(step.title + ' 结束')
    console.log('------')
  }
  console.log('所有接口均已测试结束')
  if (errorItems.length) {
    console.log('测试结果: ', errorItems.join(','), '异常。')
  } else {
    console.log('测试结果: 所有接口都正常。')
  }
}

// Surface unexpected failures of the runner itself instead of leaving a floating promise.
start().catch(function (e) {
  console.error(e)
  process.exitCode = 1
})
'use strict';
const Crawler = require('../../index.js')
const options = require('../parser/parser-healthno1.js')
const parser = new Crawler(options)
// const url = 'http://www.healthno1.com/'
// const url = 'http://www.healthno1.com/feature_articles.html?start=12'
// const url = 'http://www.healthno1.com/feature_articles.html'
// const url = 'http://www.healthno1.com/health_info/16841-2017-05-12-03-10-00.html'
const url = 'http://www.healthno1.com/16939-2017-05-19-10-16-00.html'
// Names of the checks that threw or rejected during this run.
const errorItems = []

/**
 * Runs one API check, prints its result, and records the check name in
 * `errorItems` when the call throws or rejects.
 *
 * @param {string} name - Label stored in `errorItems` on failure.
 * @param {Function} invoke - Calls the parser API under test; may return a value or a promise.
 * @returns {Promise} Settles once the check has been reported; never rejects.
 */
async function runCheck (name, invoke) {
  try {
    const result = await invoke()
    console.log('获取数据内容为', result)
  } catch (e) {
    // A rejection value is not guaranteed to be an Error, so fall back to the raw value.
    console.error('[抓取数据出错]', e && e.message ? e.message : e)
    errorItems.push(name)
  }
}

// parse(): extracted fields plus discovered links
const testParseDate = () => runCheck('testParseDate', () => parser.parse(url))
// isArticleUrl(): is the url a content page?
const testIsArticleUrl = () => runCheck('testIsArticleUrl', () => parser.isArticleUrl(url))
// getIdFromArticleUrl(): unique id derived from the url
const testGetIdFromArticleUrl = () => runCheck('testGetIdFromArticleUrl', () => parser.getIdFromArticleUrl(url))
// getContent(): extracted fields only
const testGetContent = () => runCheck('testGetContent', () => parser.getContent(url))
// getLinks(): discovered links only
const testGetLinks = () => runCheck('testGetLinks', () => parser.getLinks(url))

// Steps in execution order. A step with `enabled: false` still prints its
// banner lines but does not call the API.
const steps = [
  { title: '测试步骤1 获取内容', run: testParseDate },
  { title: '测试步骤2 校验链接是否为详情页', run: testIsArticleUrl },
  { title: '测试步骤3 获取页面链接的唯一标示', run: testGetIdFromArticleUrl },
  { title: '测试步骤4 获取详情页内容', run: testGetContent, enabled: false },
  { title: '测试步骤5 获取列表页内容', run: testGetLinks, enabled: false }
]

/**
 * Script entry point: runs every step in order and prints a summary.
 */
async function start () {
  console.log('测试开始')
  console.log('------')
  for (const step of steps) {
    console.log(step.title)
    if (step.enabled !== false) {
      await step.run()
    }
    console.log(step.title + ' 结束')
    console.log('------')
  }
  console.log('所有接口均已测试结束')
  if (errorItems.length) {
    console.log('测试结果: ', errorItems.join(','), '异常。')
  } else {
    console.log('测试结果: 所有接口都正常。')
  }
}

// Surface unexpected failures of the runner itself instead of leaving a floating promise.
start().catch(function (e) {
  console.error(e)
  process.exitCode = 1
})
'use strict';
const Crawler = require('../../index.js')
const options = require('../parser/parser-qiushibaike.js')
const parser = new Crawler(options)
// const url = 'https://www.qiushibaike.com/hot/'
// const url = 'https://www.qiushibaike.com/hot/page/4/?s=4987995'
// const url = 'https://www.qiushibaike.com/article/119101871'
// const url = 'https://www.qiushibaike.com/article/119102864'
const url = 'https://www.qiushibaike.com/article/119095438'
// Names of the checks that threw or rejected during this run.
const errorItems = []

/**
 * Runs one API check, prints its result, and records the check name in
 * `errorItems` when the call throws or rejects.
 *
 * @param {string} name - Label stored in `errorItems` on failure.
 * @param {Function} invoke - Calls the parser API under test; may return a value or a promise.
 * @returns {Promise} Settles once the check has been reported; never rejects.
 */
async function runCheck (name, invoke) {
  try {
    const result = await invoke()
    console.log('获取数据内容为', result)
  } catch (e) {
    // A rejection value is not guaranteed to be an Error, so fall back to the raw value.
    console.error('[抓取数据出错]', e && e.message ? e.message : e)
    errorItems.push(name)
  }
}

// parse(): extracted fields plus discovered links
const testParseDate = () => runCheck('testParseDate', () => parser.parse(url))
// isArticleUrl(): is the url a content page?
const testIsArticleUrl = () => runCheck('testIsArticleUrl', () => parser.isArticleUrl(url))
// isListUrl(): is the url a list page?
const testIsListUrl = () => runCheck('testIsListUrl', () => parser.isListUrl(url))
// getIdFromArticleUrl(): unique id derived from the url
const testGetIdFromArticleUrl = () => runCheck('testGetIdFromArticleUrl', () => parser.getIdFromArticleUrl(url))
// getContent(): extracted fields only
const testGetContent = () => runCheck('testGetContent', () => parser.getContent(url))
// getLinks(): discovered links only
const testGetLinks = () => runCheck('testGetLinks', () => parser.getLinks(url))

// Steps in execution order. A step with `enabled: false` still prints its
// banner lines but does not call the API.
const steps = [
  { title: '测试步骤1 获取内容', run: testParseDate },
  { title: '测试步骤2 校验链接是否为详情页', run: testIsArticleUrl },
  { title: '测试步骤3 校验链接是否为列表页', run: testIsListUrl },
  { title: '测试步骤4 获取页面链接的唯一标示', run: testGetIdFromArticleUrl },
  { title: '测试步骤5 获取详情页内容', run: testGetContent, enabled: false },
  { title: '测试步骤6 获取列表页内容', run: testGetLinks, enabled: false }
]

/**
 * Script entry point: runs every step in order and prints a summary.
 */
async function start () {
  console.log('测试开始')
  console.log('------')
  for (const step of steps) {
    console.log(step.title)
    if (step.enabled !== false) {
      await step.run()
    }
    console.log(step.title + ' 结束')
    console.log('------')
  }
  console.log('所有接口均已测试结束')
  if (errorItems.length) {
    console.log('测试结果: ', errorItems.join(','), '异常。')
  } else {
    console.log('测试结果: 所有接口都正常。')
  }
}

// Surface unexpected failures of the runner itself instead of leaving a floating promise.
start().catch(function (e) {
  console.error(e)
  process.exitCode = 1
})
// Package entry point: re-exports the Crawler constructor defined in lib/crawler.js.
const Crawler = require('./lib/crawler.js')
module.exports = Crawler;
\ No newline at end of file
'use strict';
var async = require('async');
var helper = require('./helper'),
parser = require('./parser');
/**
 * Crawler constructor: validates a site rule set and stores it on the instance.
 *
 * @param {Object} options - Site rules. `domains`, `listUrlRegexes`,
 *   `contentUrlRegexes` and `fields` must all be present.
 * @throws {Error} When `options` is not an object or any required key is missing.
 */
function Crawler (options) {
    options = options || {};
    var requiredKeys = ['domains', 'listUrlRegexes', 'contentUrlRegexes', 'fields'];
    // `in` throws a TypeError on primitives, so reject those with the same error.
    var isObjectLike = Object(options) === options;
    // Every required key must exist; checking with `some` let a rule set
    // through as long as a single key was present.
    var hasEveryKey = isObjectLike && requiredKeys.every(function (key) {
        return key in options;
    });
    if (!hasEveryKey) {
        throw new Error('options is invalid data format.');
    }
    // Copy the rules onto the instance.
    this._init(options);
}
/**
 * Copies the rule set onto the instance, applying defaults for anything omitted.
 *
 * The caller's `options` object is only read: the normalised domain is kept in
 * a local so shared (or frozen) config modules are never modified.
 *
 * @param {Object} options - Site rules as passed to the constructor.
 */
Crawler.prototype._init = function (options) {
    var domains = helper.formatUrl(options.domains);
    // Core rules
    this.domains = domains || '';// site domain / home page
    this.listUrlRegexes = options.listUrlRegexes || [];// list-page url patterns
    this.contentUrlRegexes = options.contentUrlRegexes || [];// content-page url patterns
    this.fields = options.fields || [];// fields to extract from a content page
    this.contentPage = options.contentPage || null;// "next page" rules for content pages
    this.sourceId = options.sourceId || [2, 5, 4];// url regex groups that make up the unique id
    // Request settings
    this.userAgent = options.userAgent || null;// User-Agent header
    this.charset = options.charset || null;// page encoding
    this.format = options.format || 'html';// request format: http|json|jsonp
    this.i18n = options.i18n || null;// text conversion: s2t | t2s | s2tw | tw2s | s2hk | hk2s | t2tw | t2hk
    // Hooks
    this.afterExtractField = options.afterExtractField || null;// post-process one extracted field
    this.afterExtractAll = options.afterExtractAll || null;// post-process the complete record
    this.afterExtractUrls = options.afterExtractUrls || null;// post-process the extracted urls
    this.attachFields = options.attachFields || null;// rules for additional (attached) data
};
/**
 * Classifies a url against the configured patterns.
 *
 * @param {string} url - Url to classify.
 * @param {string} [type] - Optional 'list' or 'post'. When given, only that
 *   pattern set is consulted and a boolean is returned.
 * @returns {string|boolean} Without `type`: 'post', 'list' or '' (content
 *   patterns win when both sets match). With `type`: whether the url is of that type.
 */
Crawler.prototype._judge = function (url, type) {
    // Every pattern is tested (no short-circuit), exactly one test per pattern.
    var matchesAny = function (patterns) {
        return patterns.filter(function (pattern) {
            return pattern.test(url);
        }).length > 0;
    };
    var kind = '';
    if ((!type || type === 'list') && matchesAny(this.listUrlRegexes)) {
        kind = 'list';
    }
    if ((!type || type === 'post') && matchesAny(this.contentUrlRegexes)) {
        kind = 'post';
    }
    if (type) {
        return kind === type;
    }
    return kind;
};
/**
 * Builds the unique id of a page from its url.
 *
 * The url is split by a regex into capture groups (1 scheme, 2 host, 3 port,
 * 4 directory path, 5 last path segment, 6 fragment, 7 query) and the groups
 * listed in `this.sourceId` are concatenated in that order. The host is also
 * remembered on `this._site`.
 *
 * @param {string} url - Page url.
 * @returns {string|null} The id, or null when the url is neither a list nor a
 *   content page, or cannot be split by the regex.
 */
Crawler.prototype._getSourceId = function (url) {
    var type = this._judge(url);
    if (!type) {
        console.error('The url type is not list or post.');
        return null;
    }
    var regex = /(\w+):\/\/([^\:|\/]+)(\:\d*)?(.*\/)([^#|\?|\n]+)?(#.*)?(\?.*)?/i;
    var arr = url.match(regex);
    if (!arr) {
        // e.g. a url with no "/" after the host: the regex requires a path.
        console.error('The url can not be split into source id parts.');
        return null;
    }
    this._site = arr[2];
    var sources = '';
    this.sourceId.forEach(function (item) {
        if (!arr[item]) {
            return;
        }
        if (item === 2) {
            // Host in reverse-domain order: www.a.com -> com.a.www
            sources += arr[item].split('.').reverse().join('.');
        } else {
            // Only the first "." is replaced; kept as-is because changing it
            // would alter ids that have already been generated.
            sources += arr[item].replace(/\//img, '-').replace('.', '-');
        }
    });
    // Drop a single trailing "-" left behind by a directory path.
    if (sources.substring(sources.length - 1) === '-') {
        sources = sources.substring(0, sources.length - 1);
    }
    return sources.trim();
};
/**
 * Downloads a page and, when the url is a content ("post") page, extracts its fields.
 *
 * Steps, run in order through async.waterfall:
 *   1. request the page body;
 *   2. for content pages with `contentPage` rules, let the parser merge follow-up pages;
 *   3. for content pages with `attachFields` rules, request the attached url and
 *      extract the additional fields.
 * Afterwards the fields are extracted, `from`, `sourceId` and `site` are added,
 * attached fields are merged in and the `afterExtractAll` hook is applied.
 *
 * @param {string} url - Page url.
 * @param {Function} callback - `(error, result)`; `result` is
 *   `{ bodyData, fields }` (`fields` stays null for non-content pages), or
 *   whatever `afterExtractAll` returned.
 */
Crawler.prototype._getContent = function (url, callback) {
    var self = this;
    self.url = url;
    var result = { bodyData: null, fields: null };
    var resultAttachFields = {};
    // Options shared by every request made for this page.
    var requestOptions = function () {
        return {
            format: self.format,
            charset: self.charset,
            userAgent: self.userAgent
        };
    };
    // Step 1: download the page body.
    var getBodyFields = function (done) {
        helper.request(url, requestOptions(), function (error, body) {
            if (error) {
                return done(error);
            }
            result.bodyData = body;
            done(null);
        });
    };
    // Step 2: merge follow-up ("next page") content into the body.
    var getBodyPage = function (done) {
        var hasNextPages = self._judge(url, 'post') && !!result.bodyData && !!self.contentPage;
        if (!hasNextPages) {
            return done(null);
        }
        parser.getContentPage(self, { body: result.bodyData, url: url }, function (error, body) {
            if (error) {
                return done(error);
            }
            if (body) {
                result.bodyData = body;
            }
            done(null);
        });
    };
    // Step 3: fetch and extract the attached data.
    var getAttachBodyFields = function (done) {
        var attach = self.attachFields;
        if (!self._judge(url, 'post') || !attach || !attach.url) {
            return done(null);
        }
        parser.getAttachUrl({
            url: attach.url,
            meta: attach.meta,
            body: result.bodyData
        }, function (attachError, attachUrl) {
            // Stop here on failure; otherwise the request below would be
            // issued with whatever url value came along with the error.
            if (attachError) {
                return done(attachError);
            }
            helper.request(attachUrl, requestOptions(), function (error, body) {
                if (error) {
                    return done(error);
                }
                resultAttachFields = parser.getFieldsBySelector(body, attach.fields);
                done(null);
            });
        });
    };
    async.waterfall([getBodyFields, getBodyPage, getAttachBodyFields], function (error) {
        if (error) {
            return callback(error);
        }
        if (self._judge(url, 'post') && !!result.bodyData) {
            // Extract the configured fields and add the bookkeeping ones.
            result.fields = parser.getFields(result.bodyData, self);
            result.fields.from = url;
            result.fields.sourceId = self._getSourceId(url);
            result.fields.site = self._site;
            // Merge the attached data.
            for (var name in resultAttachFields) {
                result.fields[name] = resultAttachFields[name];
            }
            // Let the rule set post-process the complete record.
            if (self.afterExtractAll) {
                result = self.afterExtractAll(result);
            }
        }
        callback(error, result);
    });
};
/**
 * Downloads a page and extracts the links found in it.
 *
 * @param {string} url - Page url.
 * @param {Function} callback - `(error, result)` where `result.urls` is the
 *   parsed link list, or null when the request produced no body.
 */
Crawler.prototype._getLinks = function (url, callback) {
    var crawler = this;
    var requestOptions = {
        format: crawler.format,
        charset: crawler.charset,
        userAgent: crawler.userAgent
    };
    helper.request(url, requestOptions, function (error, body) {
        var urls = body ? crawler._parseUrls(body, url) : null;
        callback(error, { urls: urls });
    });
};
/**
 * Extracts the links from an already downloaded page body.
 *
 * Records `url` on the instance before handing the body and the instance to
 * the parser.
 *
 * @param {*} bodyData - Downloaded page body.
 * @param {string} url - Url the body was downloaded from.
 * @returns {*} Whatever `parser.getUrls` returns.
 */
Crawler.prototype._parseUrls = function (bodyData, url) {
    this.url = url;
    return parser.getUrls(bodyData, this);
};
/**
 * Parses a page: extracted fields (content pages only) plus the links found in it.
 * Combines what `getContent` and `getLinks` provide.
 *
 * Works in both styles: the returned promise always settles, and `callback`,
 * when given, is called as `(error, result)`.
 *
 * @param {string} url - Page url.
 * @param {Function} [callback] - Optional node-style callback.
 * @returns {Promise} Resolves with `{ fields, urls }`; rejects with an Error when
 *   the url matches no configured pattern or the page can not be processed.
 */
Crawler.prototype.parse = function (url, callback) {
    url = helper.formatUrl(url);
    var self = this;
    var result = {};
    var bodyData = null;
    // Step 1: refuse urls that are neither list nor content pages.
    var judge = function (done) {
        var type = self._judge(url);
        if (!type) {
            // An Error (not a bare string) so callers can rely on `.message`.
            return done(new Error('url mismatch'));
        }
        done(null, type);
    };
    // Step 2: download the page and extract its fields.
    var parserFields = function (type, done) {
        self._getContent(url, function (error, data) {
            if (error) {
                return done(error);
            }
            if (!data) {
                // Without this the next step would be invoked with no arguments.
                return done(new Error('no data was extracted'));
            }
            bodyData = data.bodyData;
            result.fields = data.fields;
            done(null, result);
        });
    };
    // Step 3: extract the links from the downloaded body.
    var parserUrls = function (data, done) {
        result.urls = null;
        if (!!bodyData) {
            result.urls = self._parseUrls(bodyData, url);
        }
        done(null, result);
    };
    var pending = new Promise(function (resolve, reject) {
        async.waterfall([judge, parserFields, parserUrls], function (error, parsed) {
            if (error) {
                console.error(error);
                if (callback) {
                    callback(error);
                }
                return reject(error);
            }
            resolve(parsed);
            if (callback) {
                callback(null, parsed);
            }
        });
    });
    if (callback) {
        // Callback-style callers usually drop the promise; mark the rejection as
        // handled for them while still rejecting for anyone who awaits it.
        pending.catch(function () {});
    }
    return pending;
};
/**
 * Get the links waiting to be crawled from a page.
 *
 * @param {string} url - Page address; normalised with `helper.formatUrl`.
 * @param {Function} [callback] - Optional node-style callback
 *   `callback(error, urls)`.
 * @returns {?Promise} `null` (synchronously) when the url does not match any
 *   rule; otherwise a promise resolving with the extracted urls. On failure
 *   the error is logged and then passed to `callback` when one was given
 *   (the promise then stays pending), else the promise is rejected.
 */
Crawler.prototype.getLinks = function (url, callback) {
  var crawler = this;
  var target = helper.formatUrl(url);
  if (!crawler._judge(target)) {
    return null;
  }
  return new Promise(function (resolve, reject) {
    crawler._getLinks(target, function (error, result) {
      if (!error) {
        resolve(result.urls);
        if (callback) {
          callback(null, result.urls);
        }
        return;
      }
      console.error(error);
      if (callback) {
        return callback(error);
      }
      return reject(error);
    });
  });
};
/**
 * Get the content (extracted fields) of a detail page.
 *
 * @param {string} url - Page address; normalised with `helper.formatUrl`.
 * @param {Function} [callback] - Optional node-style callback
 *   `callback(error, fields)`.
 * @returns {?Promise} `null` (synchronously) when the url matches no rule or
 *   is not a detail-page url; otherwise a promise resolving with the
 *   extracted fields (`undefined` when none were produced). On failure the
 *   error is logged and then passed to `callback` when one was given (the
 *   promise then stays pending), else the promise is rejected.
 */
Crawler.prototype.getContent = function (url, callback) {
  url = helper.formatUrl(url);
  var self = this;
  var type = this._judge(url);
  if (!type) return null;
  if (!this.isArticleUrl(url)) return null;
  return new Promise(function (resolve, reject) {
    self._getContent(url, function (error, result) {
      if (error) {
        console.error(error);
        if (callback) return callback(error);
        return reject(error);
      }
      // `result` can be missing, e.g. when an afterExtractAll hook returns
      // nothing; reading `.fields` unguarded would throw inside the
      // response callback.
      var fields = result ? result.fields : undefined;
      resolve(fields);
      if (callback) {
        callback(null, fields);
      }
    });
  });
};
/**
 * Check whether a link is a detail (article) page.
 *
 * @param {string} url - Link to test; normalised with `helper.formatUrl`.
 * @returns {*} The result of `this._judge(normalisedUrl, 'post')`.
 */
Crawler.prototype.isArticleUrl = function (url) {
  var normalized = helper.formatUrl(url);
  return this._judge(normalized, 'post');
};
/**
 * Check whether a link is a list page.
 *
 * @param {string} url - Link to test; normalised with `helper.formatUrl`.
 * @returns {*} The result of `this._judge(normalisedUrl, 'list')`.
 */
Crawler.prototype.isListUrl = function (url) {
  var normalized = helper.formatUrl(url);
  return this._judge(normalized, 'list');
};
/**
 * Get the unique identifier of a page link.
 *
 * @param {string} url - Link to inspect; normalised with `helper.formatUrl`.
 * @returns {*} `this._getSourceId(normalisedUrl)` when the url matches one of
 *   the crawler's rules, otherwise `null`.
 */
Crawler.prototype.getIdFromArticleUrl = function (url) {
  var normalized = helper.formatUrl(url);
  if (!this._judge(normalized)) {
    return null;
  }
  return this._getSourceId(normalized);
};
module.exports = Crawler;
'use strict'
var request = require('request'),
iconv = require('iconv-lite'),
OpenCC = require('opencc');
/**
 * Proxy support.
 *
 * Reads the proxy address from the HTTP_PROXY, HTTPS_PROXY or ALL_PROXY
 * environment variable (first one set wins) and installs it as a default
 * option on the shared `request` instance.
 *
 * This function is invoked before every request, so the proxy that was last
 * installed is remembered: without that, each call would wrap `request` in
 * yet another `defaults()` layer.
 */
var _proxy = function () {
  var proxy = process.env.HTTP_PROXY || process.env.HTTPS_PROXY || process.env.ALL_PROXY;
  if (proxy && _proxy.appliedProxy !== proxy) {
    request = request.defaults({'proxy': proxy});
    _proxy.appliedProxy = proxy;
  }
};
/**
 * Request core: performs the HTTP GET shared by all request formats.
 *
 * @param {string} url - Address to fetch.
 * @param {Object} options - `charset` (any value other than 'utf-8' makes
 *   the body get fetched as raw bytes and decoded with iconv) and
 *   `userAgent` (sent as the User-Agent header when set).
 * @param {Function} callback - `callback(err, body)` on a 200 response.
 *   For a transport error or any other status code the error value is
 *   logged and `callback(err)` is invoked without a body.
 */
var _requestCore = function (url, options, callback) {
  _proxy();
  var needsDecoding = Boolean(options.charset && options.charset !== 'utf-8');
  var query = { url: url, headers: {} };
  if (needsDecoding) {
    // A null encoding makes `request` hand back the raw body for iconv.
    query.encoding = null;
  }
  if (options.userAgent) {
    query.headers = { 'User-Agent': options.userAgent };
  }
  request.get(query, function (err, res, body) {
    if (err || res.statusCode !== 200) {
      console.error(err);
      return callback(err);
    }
    // Handle transcoding of non-utf-8 pages.
    callback(err, needsDecoding ? iconv.decode(body, options.charset) : body);
  });
};
/**
 * Request handlers for the supported response formats.
 *
 * Every handler has the signature `(url, options, callback)` and reports
 * through `callback(error, body)`:
 *   - html:  body is passed through unchanged.
 *   - json:  body is parsed with JSON.parse.
 *   - jsonp: the payload between the first "(" and the last ")" is parsed
 *            with JSON.parse, so any callback name is accepted.
 *
 * When the request failed or produced no body, json/jsonp forward
 * `(error, body)` untouched instead of trying to parse. A body that is not
 * valid JSON is reported as `callback(parseError)` rather than thrown from
 * inside the HTTP response callback.
 */
var _request = (function () {
  // Parses `toJsonText(body)` and reports the outcome through `callback`.
  var deliverParsed = function (error, body, toJsonText, callback) {
    if (error || body === undefined || body === null) {
      return callback(error, body);
    }
    var parsed;
    try {
      parsed = JSON.parse(toJsonText(body));
    } catch (parseError) {
      return callback(parseError);
    }
    // Called outside the try block so an exception raised by the callback
    // itself is not mistaken for a parse failure.
    callback(error, parsed);
  };
  var identity = function (body) {
    return body;
  };
  // Strips the `name( ... )` wrapper of a JSONP response.
  var unwrapJsonp = function (body) {
    var text = String(body);
    var open = text.indexOf('(');
    var close = text.lastIndexOf(')');
    if (open === -1 || close <= open) {
      // No wrapper found: let JSON.parse decide whether the text is usable.
      return text;
    }
    return text.substring(open + 1, close);
  };
  return {
    html: function (url, options, callback) {
      _requestCore(url, options, function (error, body) {
        callback(error, body);
      });
    },
    json: function (url, options, callback) {
      _requestCore(url, options, function (error, body) {
        deliverParsed(error, body, identity, callback);
      });
    },
    jsonp: function (url, options, callback) {
      _requestCore(url, options, function (error, body) {
        deliverParsed(error, body, unwrapJsonp, callback);
      });
    }
  };
})();
/**
 * Request entry point.
 * Supports the html / json / jsonp formats.
 *
 * @param {string} url - Address to fetch.
 * @param {Object} [options] - Request options; `options.format` selects the
 *   handler and defaults to 'html' (the default is written back onto the
 *   object, as before).
 * @param {Function} callback - `callback(error, body)`. For an unsupported
 *   format the problem is logged and the callback receives an Error, so the
 *   caller is never left waiting.
 */
var requestUrl = function (url, options, callback) {
  options = options || {};
  options.format = options.format || 'html';
  // Own-property check so names inherited from Object.prototype
  // (e.g. "toString") are not mistaken for request handlers.
  if (!Object.prototype.hasOwnProperty.call(_request, options.format)) {
    console.error('The request format is error.');
    if (typeof callback === 'function') {
      callback(new Error('The request format is error.'));
    }
    return;
  }
  _request[options.format](url, options, callback);
};
/**
 * Convert text between Chinese variants (i18n).
 *
 * Supported types:
 *   s2t   Simplified -> Traditional
 *   t2s   Traditional -> Simplified
 *   s2tw  Simplified -> Traditional (Taiwan standard)
 *   tw2s  Traditional (Taiwan standard) -> Simplified
 *   s2hk  Simplified -> Traditional (Hong Kong standard)
 *   hk2s  Traditional (Hong Kong standard) -> Simplified
 *   t2tw  Traditional -> Traditional (Taiwan standard)
 *   t2hk  Traditional -> Traditional (Hong Kong standard)
 *
 * @param {string} str - Text to convert.
 * @param {string} [type='tw2s'] - One of the conversion types above.
 * @returns {string} The converted text, or `str` unchanged (after logging)
 *   when `type` is not supported.
 */
// One converter per conversion type, created on first use and then reused
// instead of being rebuilt on every call.
var _openccConverters = {};
var translate = function (str, type) {
  type = type || 'tw2s';
  if (['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 't2tw', 't2hk'].indexOf(type) < 0) {
    console.error(type, 'in i18n is null');
    return str;
  }
  if (!_openccConverters[type]) {
    _openccConverters[type] = new OpenCC(type + '.json');
  }
  return _openccConverters[type].convertSync(str);
};
/**
 * Append the trailing "/" to a home-page link.
 *
 * A url counts as a home-page link when the part before any query string or
 * fragment contains exactly two slashes (e.g. "http://example.com"). The
 * slash is inserted before the "?" / "#" so that query values and fragments
 * are left intact.
 *
 * @param {string} url - Link to normalise.
 * @returns {string} The link, with "/" added after the host when it was
 *   missing; every other link is returned unchanged.
 */
var formatUrl = function (url) {
  var tailStart = url.search(/[?#]/);
  var base = tailStart === -1 ? url : url.slice(0, tailStart);
  var tail = tailStart === -1 ? '' : url.slice(tailStart);
  if (base.split('/').length - 1 === 2) {
    base += '/';
  }
  return base + tail;
};
/**
 * Transcoding: replace every character outside the Latin-1 range
 * (U+0000 - U+00FF) with `&#x` followed by its four-digit upper-case hex
 * code. No trailing ";" is emitted.
 *
 * @param {string} str - Text to encode.
 * @returns {string} The text with all non-Latin-1 UTF-16 code units replaced.
 */
var encode = function (str) {
  var toEntity = function (ch) {
    var hex = ch.charCodeAt(0).toString(16).toUpperCase();
    return '&#x' + ('0000' + hex).slice(-4);
  };
  return str.replace(/[^\u0000-\u00FF]/g, toEntity);
};
/**
 * Reverse transcoding: turn `&#xXXXX;` references back into characters.
 *
 * The references are first rewritten to `%uXXXX` and the whole string is
 * then passed through `unescape`, so percent escapes already present in
 * `str` are decoded too. Any remaining `%uA0` / `&#xA0;` sequence is
 * replaced by a plain space.
 *
 * @param {string} str - Text containing hexadecimal character references.
 * @returns {string} The decoded text.
 */
var rencode = function (str) {
  var percentForm = str.replace(/(&#x)(\w{4});/gi, "%u$2");
  var decoded = unescape(percentForm);
  return decoded.replace(/%uA0/img, ' ').replace(/&#xA0;/img, ' ');
};
/**
 * Remove duplicates from an array, keeping the first occurrence of each item.
 *
 * Items are compared by their string form (they are used as object keys),
 * so `1` and `'1'` count as the same item.
 *
 * @param {Array} arr - Items to filter; the input array is not modified.
 * @returns {Array} A new array without duplicates, in the original order.
 */
var deDuplication = function (arr) {
  // A prototype-less object, so items named like Object.prototype members
  // ("constructor", "toString", ...) are not treated as already seen.
  var seen = Object.create(null);
  return arr.filter(function (item) {
    if (seen[item]) {
      return false;
    }
    seen[item] = true;
    return true;
  });
};
module.exports = {
request: requestUrl,
translate: translate,
formatUrl: formatUrl,
encode: encode,
rencode: rencode,
deDuplication: deDuplication
};
This diff is collapsed.
This diff is collapsed.
{
"name": "almighty-parser-core",
"version": "1.0.7",
"description": "crawler parser core",
"main": "index.js",
"scripts": {
"test:qiushibaike": "node --harmony-async-await ./example/test/qiushibaike.js",
"test:healthno1": "node --harmony-async-await ./example/test/healthno1.js",
"test:baoliao5": "node --harmony-async-await ./example/test/baoliao5.js"
},
"repository": {
"type": "git",
"url": "git@github.com:coolfishstudio/almighty-parser-core.git"
},
"keywords": "crawler, parser",
"author": "Yves",
"license": "MIT",
"dependencies": {
"async": "^2.4.1",
"cheerio": "^1.0.0-rc.1",
"iconv-lite": "^0.4.17",
"opencc": "^1.0.5",
"request": "^2.81.0",
"xmldom": "^0.1.27",
"xpath": "0.0.24"
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment