Initial commit

c804c41e · coolfish · c804c41e · c804c41e · c804c41e · c804c41e
Commit c804c41e authored Jul 13, 2017 by coolfish
19 changed files
--- a/.gitignore
+++ b/.gitignore
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (http://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Typescript v1 declaration files
+typings/
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variables file
+.env
--- a/.npmignore
+++ b/.npmignore
+doc
+example
--- a/LICENSE
+++ b/LICENSE
+MIT License
+
+Copyright (c) 2017 coolfish
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
--- a/README.md
+++ b/README.md
+# 页面爬虫解析器核心
+此工具适用于
+1. 对单独页面链接进行解析
+2. 配合队列进行多页面解析
+
+
+## 解释说明
+支持详情页下一页抓取，支持繁体转换，支持对字数统计，支持对图片数量统计。
+目前主要针对静态页的解析，对json请求和jsonp请求的解析做了预留(暂不支持)。
+
+[![NPM](https://nodei.co/npm/almighty-parser-core.png?downloads=true&downloadRank=true&stars=true)](https://nodei.co/npm/almighty-parser-core/)
+
+[![npm](https://img.shields.io/npm/v/almighty-parser-core.svg)]()
+[![npm](https://img.shields.io/npm/dm/almighty-parser-core.svg)]()
+[![license](https://img.shields.io/github/license/coolfishstudio/almighty-parser-core.svg)]()
+
+## 安装
+```
+npm i --save almighty-parser-core
+```
+
+## api接口
+- [x] `getLinks` 获取待抓页链接
+- [x] `getContent` 获取详情页内容
+- [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合]
+- [x] `isArticleUrl` 检测链接是否是详情页
+- [x] `isListUrl` 检测链接是否是列表页
+- [x] `getIdFromArticleUrl` 获取页面链接的唯一标识
+
+## 配置参数
+[文档说明](https://github.com/coolfishstudio/almighty-parser-core/blob/master/doc/CONFIG.md)
+
+## 实例
+### 解析器案例
+- [糗事百科 - 基础](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-qiushibaike.js)
+- [今日健康 - 繁体](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-healthno1.js)
+- [爆料网 - 详情下一页](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/parser/parser-baoliao5.js)
+
+### 定义网站规则
+```
+module.exports = {
+    // 域名 网站域名,设置域名后只处理这些域名下的网页
+    domains: 'https://www.qiushibaike.com/',
+    // 列表页url的正则，符合这些正则的页面会被当作列表页处理
+    listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
+    // 内容页url的正则，符合这些正则的页面会被当作内容页处理
+    contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
+    // 从内容页中抽取需要的数据
+    fields: [{
+        // 作者
+        name: 'author',
+        meta: {
+            selector: ['.author h2'],
+            format: 'text'
+        }
+    }, {
+        // 标签 
+        name: 'tags',
+        meta: {
+            format: 'text',
+            selector: ['.source a'],
+            index: 0
+        }
+    }, {
+        // 网页关键字
+        name: 'keywords',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="keywords"]']
+        }
+    }, {
+        // 网页描述
+        name: 'description',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="description"]']
+        }
+    }, {
+        // 详情
+        name: 'content',
+        meta: {
+            selector: ['.content', '.thumb'],
+            format: 'html'
+        },
+        required: true
+    }, {
+        name: 'imagesCount',
+        meta: {
+            selector: ['.thumb'],
+            format: 'count',
+            countType: 'image'
+        },
+        defaultValue: 0
+    }, {
+        name: 'wordsCount',
+        meta: {
+            selector: ['.content'],
+            format: 'count',
+            countType: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'comments',
+        meta: {
+            selector: ['.stats-comments .number'],
+            format: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'likes',
+        meta: {
+            selector: ['.stats-vote .number'],
+            format: 'text'
+        },
+        defaultValue: 0
+    }],
+    // 是否模拟用户请求
+    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
+    // 编码 默认utf-8
+    charset: null,
+    // 回调函数 对所有数据做处理
+    afterExtractAll: function (data) {
+        data.fields['hits'] = 0;
+        return data;
+    },
+    afterExtractField: function (fieldsName, data) {
+        if (fieldsName === 'tags') {
+            data = data ? data.split(',') : [];
+        }
+        if (fieldsName === 'comments') {
+            data = +data;
+        }
+        if (fieldsName === 'likes') {
+            data = +data;
+        }
+        return data;
+    }
+};
+```
+
+### 引入
+```
+const Crawler = require('almighty-parser-core')
+const options = require('../parser/parser-qiushibaike.js')
+const parser = new Crawler(options)
+```
+
+### API测试
+[测试案例](https://github.com/coolfishstudio/almighty-parser-core/blob/master/example/test/qiushibaike.js)
+#### parse
+```
+{ fields:
+   { author: '草莓、牛奶巧克力',
+     tags: [ '搞笑图片' ],
+     keywords: '',
+     description: '笑死我了',
+     content: '<div class="content">\n\n笑死我了\n\n</div><div class="thumb">\n\n<img src="//pic.qiushibaike.com/system/pictures/11909/119095438/medium/app119095438.jpg" alt="糗事#119095438">\n\n</div>',
+     imagesCount: 1,
+     wordsCount: 4,
+     comments: 0,
+     likes: 457,
+     from: 'https://www.qiushibaike.com/article/119095438',
+     sourceId: 'com.qiushibaike.www-article-119095438',
+     site: 'www.qiushibaike.com',
+     hits: 0 },
+  urls:
+   [ 'https://www.qiushibaike.com/',
+     'https://www.qiushibaike.com/hot/',
+     'https://www.qiushibaike.com/imgrank/',
+     'https://www.qiushibaike.com/text/',
+     'https://www.qiushibaike.com/history/',
+     'https://www.qiushibaike.com/pic/',
+     'https://www.qiushibaike.com/textnew/',
+     'https://www.qiushibaike.com/my',
+     'https://www.qiushibaike.com/article/116423562',
+     'https://www.qiushibaike.com/article/116424718',
+     'https://www.qiushibaike.com/article/116421669',
+     'https://www.qiushibaike.com/article/116423344',
+     'https://www.qiushibaike.com/article/116426229',
+     'https://www.qiushibaike.com/article/116423107',
+     'https://www.qiushibaike.com/article/104614784',
+     'https://www.qiushibaike.com/article/104590828',
+     'https://www.qiushibaike.com/article/104629666',
+     'https://www.qiushibaike.com/article/104599846',
+     'https://www.qiushibaike.com/article/104598154',
+     'https://www.qiushibaike.com/article/104619022',
+     'https://www.qiushibaike.com/article/118954381',
+     'https://www.qiushibaike.com/article/118491926',
+     'https://www.qiushibaike.com/article/118563113',
+     'https://www.qiushibaike.com/article/118806836',
+     'https://www.qiushibaike.com/article/118525804',
+     'https://www.qiushibaike.com/article/118770803',
+     'https://www.qiushibaike.com/article/119008939',
+     'https://www.qiushibaike.com/article/119033005',
+     'https://www.qiushibaike.com/article/119036209',
+     'https://www.qiushibaike.com/article/118922421',
+     'https://www.qiushibaike.com/article/119014594',
+     'https://www.qiushibaike.com/article/119009873',
+     'https://www.qiushibaike.com/article/118934286',
+     'https://www.qiushibaike.com/joke/',
+     'https://www.qiushibaike.com/article/' ] }
+```
+
+其余接口测试请下载后运行
+```
+npm run test:qiushibaike
+```
+
+## License
+
+[MIT License](https://opensource.org/licenses/MIT)
--- a/doc/API.md
+++ b/doc/API.md
+## API 文档
+
+- [x] `getLinks` 获取待抓页链接
+- [x] `getContent` 获取详情页内容
+- [x] `parse` 解析获取内容[为`getLinks`与`getContent`的集合]
+- [x] `isArticleUrl` 检测链接是否是详情页
+- [x] `isListUrl` 检测链接是否是列表页
+- [x] `getIdFromArticleUrl` 获取页面链接的唯一标识
--- a/doc/CONFIG.md
+++ b/doc/CONFIG.md
+## 配置参数
+
+针对不同的网站，需要定义各自的配置。
+
+注意 目前只支持html静态页的内容抓取
+
+配置 | 描述 | 是否必填 | 类型
+------------- | ------------- | ------------- | -------------
+domains | 网站域名 | 必填 | 字符串
+listUrlRegexes | 列表页url的正则，符合这些正则的页面会被当作列表页处理 | 必填 | 数组
+contentUrlRegexes | 内容页url的正则，符合这些正则的页面会被当作内容页处理 | 必填 | 数组
+fields | 从内容页中抽取需要的数据 | 必填 | fields示例
+userAgent | 是否模拟用户请求 | 选填 | 字符串
+charset | 编码 默认utf-8 | 选填 | 字符串
+afterExtractField | 回调函数 对每一个抽取出来的数据进行处理 | 选填 | 方法
+afterExtractAll | 回调函数 对所有抽取出来的数据进行处理 | 选填 | 方法
+contentPage | 对详情页下一页内容处理 | 选填 | contentPage示例
+
+## fields示例
+字段 | 描述 | 类型
+------------- | ------------- | -------------
+name | 定义字段名字 | 字符串 必填
+meta | 选择器 | meta示例 必填
+defaultValue | 默认值 | 任意 选填
+
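+### meta示例
+字段 | 描述 | 类型
+------------- | ------------- | -------------
+selector | 选择器(支持多个拼接) | 数组 必填
+format | 返回内容的格式[text/html/meta/count 默认text] | 字符串 选填
+countType | format 为 count 时的统计类型[image/text] | 字符串 选填
+index | 下标 | 数字 选填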
+
+## contentPage示例
+字段 | 描述 | 类型
+------------- | ------------- | -------------
+urls | 下一页的正则 | 数组 必填
+selector | 选择器 | 数组 必填
+appendNode | 插入的位置 | 任意 必填
\ No newline at end of file
--- a/doc/EXAMPLE.md
+++ b/doc/EXAMPLE.md
+## 实例
+### 定义网站规则
+```
+module.exports = {
+    // 域名 网站域名,设置域名后只处理这些域名下的网页
+    domains: 'https://www.qiushibaike.com/',
+    // 列表页url的正则，符合这些正则的页面会被当作列表页处理
+    listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
+    // 内容页url的正则，符合这些正则的页面会被当作内容页处理
+    contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
+    // 从内容页中抽取需要的数据
+    fields: [{
+        // 作者
+        name: 'author',
+        meta: {
+            selector: ['.author h2'],
+            format: 'text'
+        }
+    }, {
+        // 标签 
+        name: 'tags',
+        meta: {
+            format: 'text',
+            selector: ['.source a'],
+            index: 0
+        }
+    }, {
+        // 网页关键字
+        name: 'keywords',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="keywords"]']
+        }
+    }, {
+        // 网页描述
+        name: 'description',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="description"]']
+        }
+    }, {
+        // 详情
+        name: 'content',
+        meta: {
+            selector: ['.content', '.thumb'],
+            format: 'html'
+        },
+        required: true
+    }, {
+        name: 'imagesCount',
+        meta: {
+            selector: ['.thumb'],
+            format: 'count',
+            countType: 'image'
+        },
+        defaultValue: 0
+    }, {
+        name: 'wordsCount',
+        meta: {
+            selector: ['.content'],
+            format: 'count',
+            countType: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'comments',
+        meta: {
+            selector: ['.stats-comments .number'],
+            format: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'likes',
+        meta: {
+            selector: ['.stats-vote .number'],
+            format: 'text'
+        },
+        defaultValue: 0
+    }],
+    // 是否模拟用户请求
+    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
+    // 编码 默认utf-8
+    charset: null,
+    // 回调函数 对所有数据做处理
+    afterExtractAll: function (data) {
+        data.fields['hits'] = 0;
+        return data;
+    },
+    afterExtractField: function (fieldsName, data) {
+        if (fieldsName === 'tags') {
+            data = data ? data.split(',') : [];
+        }
+        if (fieldsName === 'comments') {
+            data = +data;
+        }
+        if (fieldsName === 'likes') {
+            data = +data;
+        }
+        return data;
+    }
+};
+```
+
+### 引入
+```
+const Crawler = require('almighty-parser-core')
+const options = require('../parser/parser-qiushibaike.js')
+const parser = new Crawler(options)
+```
+
+### API测试
+#### parse
+```
+{ fields:
+   { author: '草莓、牛奶巧克力',
+     tags: [ '搞笑图片' ],
+     keywords: '',
+     description: '笑死我了',
+     content: '<div class="content">\n\n笑死我了\n\n</div><div class="thumb">\n\n<img src="//pic.qiushibaike.com/system/pictures/11909/119095438/medium/app119095438.jpg" alt="糗事#119095438">\n\n</div>',
+     imagesCount: 1,
+     wordsCount: 4,
+     comments: 0,
+     likes: 457,
+     from: 'https://www.qiushibaike.com/article/119095438',
+     sourceId: 'com.qiushibaike.www-article-119095438',
+     site: 'www.qiushibaike.com',
+     hits: 0 },
+  urls:
+   [ 'https://www.qiushibaike.com/',
+     'https://www.qiushibaike.com/hot/',
+     'https://www.qiushibaike.com/imgrank/',
+     'https://www.qiushibaike.com/text/',
+     'https://www.qiushibaike.com/history/',
+     'https://www.qiushibaike.com/pic/',
+     'https://www.qiushibaike.com/textnew/',
+     'https://www.qiushibaike.com/my',
+     'https://www.qiushibaike.com/article/116423562',
+     'https://www.qiushibaike.com/article/116424718',
+     'https://www.qiushibaike.com/article/116421669',
+     'https://www.qiushibaike.com/article/116423344',
+     'https://www.qiushibaike.com/article/116426229',
+     'https://www.qiushibaike.com/article/116423107',
+     'https://www.qiushibaike.com/article/104614784',
+     'https://www.qiushibaike.com/article/104590828',
+     'https://www.qiushibaike.com/article/104629666',
+     'https://www.qiushibaike.com/article/104599846',
+     'https://www.qiushibaike.com/article/104598154',
+     'https://www.qiushibaike.com/article/104619022',
+     'https://www.qiushibaike.com/article/118954381',
+     'https://www.qiushibaike.com/article/118491926',
+     'https://www.qiushibaike.com/article/118563113',
+     'https://www.qiushibaike.com/article/118806836',
+     'https://www.qiushibaike.com/article/118525804',
+     'https://www.qiushibaike.com/article/118770803',
+     'https://www.qiushibaike.com/article/119008939',
+     'https://www.qiushibaike.com/article/119033005',
+     'https://www.qiushibaike.com/article/119036209',
+     'https://www.qiushibaike.com/article/118922421',
+     'https://www.qiushibaike.com/article/119014594',
+     'https://www.qiushibaike.com/article/119009873',
+     'https://www.qiushibaike.com/article/118934286',
+     'https://www.qiushibaike.com/joke/',
+     'https://www.qiushibaike.com/article/' ] }
+```
+
+其余接口测试请下载后运行
+```
+npm run test:qiushibaike
+```
+
+
+
+
+
--- a/example/parser/parser-baoliao5.js
+++ b/example/parser/parser-baoliao5.js
+'use strict';
+/**
+ * 爆料网
+ * http://www.baoliao5.com/
+ */
+module.exports = {
+    // 域名 网站域名,设置域名后只处理这些域名下的网页
+    domains: 'http://www.baoliao5.com/',
+    // 列表页url的正则，符合这些正则的页面会被当作列表页处理
+    listUrlRegexes: [/http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)*$/, /http:\/\/www\.baoliao5\.com\/((?!meitu)[a-z]+\/?)+\/list[0-9_]+\.html*$/],
+    // 内容页url的正则，符合这些正则的页面会被当作内容页处理
+    contentUrlRegexes: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+\.html/],
+    // 从内容页中抽取需要的数据
+    fields: [{
+        // 标题
+        name: 'title',
+        meta: {
+            // 默认 type 为 jquery/text/xpath
+            selector: ['.t4Btit'],
+            format: 'text'
+        },
+        required: true
+    }, {
+        // 详情
+        name: 'content',
+        meta: {
+            selector: ['#icontent'],
+            format: 'html'
+        },
+        required: true
+    }, {
+        // 作者
+        name: 'author',
+        meta: {
+            selector: ['.t4Bexp'],
+            format: 'text'
+        }
+    }, {
+        // 标签 
+        name: 'tags',
+        meta: {
+            format: 'text',
+            selector: ['.itj_lt .lc a'],
+            index: 1
+        }
+    }, {
+        // 网页关键字
+        name: 'keywords',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="keywords"]']
+        }
+    }, {
+        // 网页描述
+        name: 'description',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="description"]']
+        }
+    }, {
+        name: 'imagesCount',
+        meta: {
+            selector: ['#icontent'],
+            format: 'count',
+            countType: 'image'
+        },
+        defaultValue: 0
+    }, {
+        name: 'wordsCount',
+        meta: {
+            selector: ['#icontent'],
+            format: 'count',
+            countType: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'publishedAt',
+        meta: {
+            format: 'text',
+            selector: ['.t4Bexp']
+        }
+    }],
+    // 内容下一页
+    contentPage: {
+        urls: [/http:\/\/www\.baoliao5\.com\/(?!meitu)[a-z]+\/[0-9]+\/[0-9]+_[0-9]+\.html/],
+        selector: ['#icontent'],
+        appendNode: '#icontent'
+    },
+    // 是否模拟用户请求
+    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
+    // 编码 默认utf-8
+    charset: 'gb2312',
+    // 回调函数 对所有数据做处理
+    afterExtractAll: function (data) {
+        data.fields['comments'] = 0;
+        data.fields['hits'] = 0;
+        data.fields['likes'] = 0;
+        return data;
+    },
+    afterExtractField: function (fieldsName, data) {
+        if (fieldsName === 'author') {
+            data = data.trim()
+            if (data.indexOf('编辑：') >= 0) {
+                var arr = data.split('编辑：');
+                data = arr[arr.length - 1];
+            } else {
+                data = '';
+            }
+        }
+        if (fieldsName === 'publishedAt') {
+            data = new Date(data.replace(/[^0-9\-\: ]+/img, '')).getTime() || new Date().getTime();
+        }
+        if (fieldsName === 'tags') {
+            data = (data !== '') ? [data] : [];
+        }
+        return data;
+    }
+};
--- a/example/parser/parser-healthno1.js
+++ b/example/parser/parser-healthno1.js
+'use strict';
+/**
+ * healthNo1
+ * http://www.healthno1.com/
+ */
+module.exports = {
+    // 域名 网站域名,设置域名后只处理这些域名下的网页
+    domains: 'http://www.healthno1.com/',
+    // 列表页url的正则，符合这些正则的页面会被当作列表页处理
+    listUrlRegexes: [/^http:\/\/www\.healthno1\.com(\/[a-z_]+(\.html)?)*(\/)?(\?start=[0-9]+)?$/],
+    // 内容页url的正则，符合这些正则的页面会被当作内容页处理
+    contentUrlRegexes: [/^http:\/\/www\.healthno1\.com\/([a-z_]+\/)*[0-9-]+\.html$/],
+    // 从内容页中抽取需要的数据
+    fields: [{
+        // 标题
+        name: 'title',
+        meta: {
+            // 默认 type 为 jquery/text/xpath
+            selector: ['#gkContentWrap .item-page header h1'],
+            format: 'text'
+        },
+        required: true
+    }, {
+        // 详情
+        name: 'content',
+        meta: {
+            selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
+            format: 'html'
+        },
+        required: true
+    }, {
+        // 作者
+        name: 'author',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="author"]']
+        }
+    }, {
+        // 标签 
+        name: 'tags',
+        meta: {
+            format: 'text',
+            selector: ['.category-name a'],
+            index: 0
+        }
+    }, {
+        // 网页关键字
+        name: 'keywords',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="keywords"]']
+        }
+    }, {
+        // 网页描述
+        name: 'description',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="description"]']
+        }
+    }, {
+        name: 'imagesCount',
+        meta: {
+            selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
+            format: 'count',
+            countType: 'image'
+        },
+        defaultValue: 0
+    }, {
+        name: 'wordsCount',
+        meta: {
+            selector: ['#gkContentWrap .item-page .itemBody img', '#gkContentWrap .item-page .itemBody p'],
+            format: 'count',
+            countType: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'publishedAt',
+        meta: {
+            format: 'text',
+            selector: ['.created time']
+        }
+    }, {
+        name: 'hits',
+        meta: {
+            format: 'text',
+            selector: ['.hits'],
+            index: 0
+        },
+        defaultValue: 0
+    }],
+    // 是否模拟用户请求
+    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
+    // 编码 默认utf-8
+    charset: null,
+    // 语言格式
+    i18n: 'tw2s',
+    // 回调函数 对所有数据做处理
+    afterExtractAll: function (data) {
+        data.fields['comments'] = 0;
+        data.fields['likes'] = 0;
+        return data;
+    },
+    afterExtractField: function (fieldsName, data) {
+        if (fieldsName === 'publishedAt') {
+            data = new Date(data.replace(/[^0-9\- \:]+/img, '')).getTime() || new Date().getTime();
+        }
+        if (fieldsName === 'tags') {
+            data = (data !== '') ? [data] : [];
+        }
+        if (fieldsName === 'title') {
+            data = data.trim();
+        }
+        if (fieldsName === 'hits') {
+            data = data.replace(/[^0-9]+/img, '') || 0;
+        }
+        return data;
+    }
+};
--- a/example/parser/parser-qiushibaike.js
+++ b/example/parser/parser-qiushibaike.js
+'use strict';
+/**
+ * 糗事百科
+ * https://www.qiushibaike.com/
+ */
+module.exports = {
+    // 域名 网站域名,设置域名后只处理这些域名下的网页
+    domains: 'https://www.qiushibaike.com/',
+    // 列表页url的正则，符合这些正则的页面会被当作列表页处理
+    listUrlRegexes: [/^https:\/\/www\.qiushibaike\.com(\/[a-z0-9]+(\/page\/[0-9]+)?)?(\/)?$/],
+    // 内容页url的正则，符合这些正则的页面会被当作内容页处理
+    contentUrlRegexes: [/^https:\/\/www\.qiushibaike\.com\/article\/[0-9]+$/],
+    // 从内容页中抽取需要的数据
+    fields: [{
+        // 作者
+        name: 'author',
+        meta: {
+            selector: ['.author h2'],
+            format: 'text'
+        }
+    }, {
+        // 标签 
+        name: 'tags',
+        meta: {
+            format: 'text',
+            selector: ['.source a'],
+            index: 0
+        }
+    }, {
+        // 网页关键字
+        name: 'keywords',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="keywords"]']
+        }
+    }, {
+        // 网页描述
+        name: 'description',
+        meta: {
+            format: 'meta',
+            selector: ['meta[name="description"]']
+        }
+    }, {
+        // 详情
+        name: 'content',
+        meta: {
+            selector: ['.content', '.thumb'],
+            format: 'html'
+        },
+        required: true
+    }, {
+        name: 'imagesCount',
+        meta: {
+            selector: ['.thumb'],
+            format: 'count',
+            countType: 'image'
+        },
+        defaultValue: 0
+    }, {
+        name: 'wordsCount',
+        meta: {
+            selector: ['.content'],
+            format: 'count',
+            countType: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'comments',
+        meta: {
+            selector: ['.stats-comments .number'],
+            format: 'text'
+        },
+        defaultValue: 0
+    }, {
+        name: 'likes',
+        meta: {
+            selector: ['.stats-vote .number'],
+            format: 'text'
+        },
+        defaultValue: 0
+    }],
+    // 是否模拟用户请求
+    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
+    // 编码 默认utf-8
+    charset: null,
+    // 回调函数 对所有数据做处理
+    afterExtractAll: function (data) {
+        data.fields['hits'] = 0;
+        return data;
+    },
+    afterExtractField: function (fieldsName, data) {
+        if (fieldsName === 'tags') {
+            data = data ? data.split(',') : [];
+        }
+        if (fieldsName === 'comments') {
+            data = +data;
+        }
+        if (fieldsName === 'likes') {
+            data = +data;
+        }
+        return data;
+    }
+};
--- a/example/test/baoliao5.js
+++ b/example/test/baoliao5.js
+'use strict';
+const Crawler = require('../../index.js')
+const options = require('../parser/parser-baoliao5.js')
+const parser = new Crawler(options)
+
+// const url = 'http://www.baoliao5.com/'
+const url = 'http://www.baoliao5.com/yule/201701/1867.html'
+// const url = 'http://www.baoliao5.com/yingshi/201701/1835.html'
+// const url = 'http://www.baoliao5.com/yingshi/'
+// const url = 'http://www.baoliao5.com/yingshi/list_7_11.html'
+// const url = 'http://www.baoliao5.com/meitu/201701/1848.html'
+// const url = 'http://www.baoliao5.com/yule/neidi/'
+
+let errorItems = []
+
+// 测试获取内容
+async function testParseDate () {
+    try {
+        const result = await parser.parse(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testParseDate')
+    }
+}
+// 检测链接是否是详情页
+function testIsArticleUrl () {
+    try {
+        const result = parser.isArticleUrl(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testIsArticleUrl')
+    }
+}
+// 测试页面链接的唯一标示
+function testGetIdFromArticleUrl () {
+    try {
+        const result = parser.getIdFromArticleUrl(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testGetIdFromArticleUrl')
+    }
+}
+
+// 获取详情页内容
+async function testGetContent () {
+    try {
+        const result = await parser.getContent(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testGetContent')
+    }
+}
+
+// 获取详情页内容
+async function testGetLinks () {
+    try {
+        const result = await parser.getLinks(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testGetLinks')
+    }
+}
+
+// 测试入口
+async function start () {
+    console.log('测试开始')
+    console.log('－－－－－－')
+    console.log('测试步骤1 获取内容')
+    await testParseDate()
+    console.log('测试步骤1 获取内容 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤2 校验链接是否为详情页')
+    testIsArticleUrl()
+    console.log('测试步骤2 校验链接是否为详情页 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤3 获取页面链接的唯一标示')
+    testGetIdFromArticleUrl()
+    console.log('测试步骤3 获取页面链接的唯一标示 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤4 获取详情页内容')
+    // await testGetContent()
+    console.log('测试步骤4 获取详情页内容 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤5 获取列表页内容')
+    await testGetLinks()
+    console.log('测试步骤5 获取列表页内容 结束')
+    console.log('－－－－－－')
+    console.log('所有接口均已测试结束')
+    if (errorItems.length) {
+        console.log('测试结果: ', errorItems.join(','), '异常。')
+    } else {
+        console.log('测试结果: 所有接口都正常。')
+    }
+}
+start()
--- a/example/test/healthno1.js
+++ b/example/test/healthno1.js
+'use strict';
+const Crawler = require('../../index.js')
+const options = require('../parser/parser-healthno1.js')
+const parser = new Crawler(options)
+
+// const url = 'http://www.healthno1.com/'
+// const url = 'http://www.healthno1.com/feature_articles.html?start=12'
+// const url = 'http://www.healthno1.com/feature_articles.html'
+// const url = 'http://www.healthno1.com/health_info/16841-2017-05-12-03-10-00.html'
+const url = 'http://www.healthno1.com/16939-2017-05-19-10-16-00.html'
+
+let errorItems = []
+
+// 测试获取内容
+async function testParseDate () {
+    try {
+        const result = await parser.parse(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testParseDate')
+    }
+}
+// 检测链接是否是详情页
+function testIsArticleUrl () {
+    try {
+        const result = parser.isArticleUrl(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testIsArticleUrl')
+    }
+}
+// 测试页面链接的唯一标示
+function testGetIdFromArticleUrl () {
+    try {
+        const result = parser.getIdFromArticleUrl(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testGetIdFromArticleUrl')
+    }
+}
+
+// 获取详情页内容
+async function testGetContent () {
+    try {
+        const result = await parser.getContent(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testGetContent')
+    }
+}
+
+// 获取详情页内容
+async function testGetLinks () {
+    try {
+        const result = await parser.getLinks(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testGetLinks')
+    }
+}
+
+// 测试入口
+async function start () {
+    console.log('测试开始')
+    console.log('－－－－－－')
+    console.log('测试步骤1 获取内容')
+    await testParseDate()
+    console.log('测试步骤1 获取内容 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤2 校验链接是否为详情页')
+    testIsArticleUrl()
+    console.log('测试步骤2 校验链接是否为详情页 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤3 获取页面链接的唯一标示')
+    testGetIdFromArticleUrl()
+    console.log('测试步骤3 获取页面链接的唯一标示 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤4 获取详情页内容')
+    // await testGetContent()
+    console.log('测试步骤4 获取详情页内容 结束')
+    console.log('－－－－－－')
+    console.log('测试步骤5 获取列表页内容')
+    // await testGetLinks()
+    console.log('测试步骤5 获取列表页内容 结束')
+    console.log('－－－－－－')
+    console.log('所有接口均已测试结束')
+    if (errorItems.length) {
+        console.log('测试结果: ', errorItems.join(','), '异常。')
+    } else {
+        console.log('测试结果: 所有接口都正常。')
+    }
+}
+start()
--- a/example/test/qiushibaike.js
+++ b/example/test/qiushibaike.js
+'use strict';
+const Crawler = require('../../index.js')
+const options = require('../parser/parser-qiushibaike.js')
+const parser = new Crawler(options)
+
+// const url = 'https://www.qiushibaike.com/hot/'
+// const url = 'https://www.qiushibaike.com/hot/page/4/?s=4987995'
+// const url = 'https://www.qiushibaike.com/article/119101871'
+// const url = 'https://www.qiushibaike.com/article/119102864'
+const url = 'https://www.qiushibaike.com/article/119095438'
+
+let errorItems = []
+
+// 测试获取内容
+async function testParseDate () {
+    try {
+        const result = await parser.parse(url)
+        console.log('获取数据内容为', result)
+    } catch (e) {
+        console.error('[抓取数据出错]', e.message)
+        errorItems.push('testParseDate')
+    }
+}
+// Check parser.isArticleUrl(): print whether the sample URL is a detail
+// (article) page, or record this check's name in errorItems if it throws.
+function testIsArticleUrl () {
+    try {
+        console.log('获取数据内容为', parser.isArticleUrl(url))
+    } catch (failure) {
+        console.error('[抓取数据出错]', failure.message)
+        errorItems.push('testIsArticleUrl')
+    }
+}
+// Check parser.isListUrl(): print whether the sample URL is a list page,
+// or record this check's name in errorItems if it throws.
+function testIsListUrl () {
+    try {
+        console.log('获取数据内容为', parser.isListUrl(url))
+    } catch (failure) {
+        console.error('[抓取数据出错]', failure.message)
+        errorItems.push('testIsListUrl')
+    }
+}
+// Check parser.getIdFromArticleUrl(): print the unique identifier derived
+// from the sample URL, or record this check's name in errorItems if it throws.
+function testGetIdFromArticleUrl () {
+    try {
+        console.log('获取数据内容为', parser.getIdFromArticleUrl(url))
+    } catch (failure) {
+        console.error('[抓取数据出错]', failure.message)
+        errorItems.push('testGetIdFromArticleUrl')
+    }
+}
+
+// Check parser.getContent(): print the detail-page content fetched for the
+// sample URL, or log the failure and record this check's name in errorItems.
+async function testGetContent () {
+    try {
+        console.log('获取数据内容为', await parser.getContent(url))
+    } catch (failure) {
+        console.error('[抓取数据出错]', failure.message)
+        errorItems.push('testGetContent')
+    }
+}
+
+// Check parser.getLinks(): print the links collected from the sample URL,
+// or log the failure and record this check's name in errorItems.
+async function testGetLinks () {
+    try {
+        console.log('获取数据内容为', await parser.getLinks(url))
+    } catch (failure) {
+        console.error('[抓取数据出错]', failure.message)
+        errorItems.push('testGetLinks')
+    }
+}
+
+// Test entry point: runs every enabled check in order and prints a summary.
+async function start () {
+    const divider = '－－－－－－'
+    // [step title, check to run]; a null check marks a step that is currently
+    // disabled (its call was commented out) but whose banner is still printed.
+    const steps = [
+        ['测试步骤1 获取内容', testParseDate],
+        ['测试步骤2 校验链接是否为详情页', testIsArticleUrl],
+        ['测试步骤3 校验链接是否为列表页', testIsListUrl],
+        ['测试步骤4 获取页面链接的唯一标示', testGetIdFromArticleUrl],
+        ['测试步骤5 获取详情页内容', null], // testGetContent
+        ['测试步骤6 获取列表页内容', null] // testGetLinks
+    ]
+
+    console.log('测试开始')
+    for (const [title, check] of steps) {
+        console.log(divider)
+        console.log(title)
+        if (check) {
+            await check()
+        }
+        console.log(title + ' 结束')
+    }
+    console.log(divider)
+    console.log('所有接口均已测试结束')
+
+    // Each failing check pushed its own name into errorItems.
+    if (errorItems.length === 0) {
+        console.log('测试结果: 所有接口都正常。')
+        return
+    }
+    console.log('测试结果: ', errorItems.join(','), '异常。')
+}
+start()
--- a/index.js
+++ b/index.js
+// Package entry point: re-export the Crawler constructor from lib/crawler.js.
+module.exports = require('./lib/crawler.js');
\ No newline at end of file
--- a/lib/crawler.js
+++ b/lib/crawler.js
+'use strict';
+var async = require('async');
+
+var helper = require('./helper'),
+    parser = require('./parser'); 
+
+/**
+ * Crawler constructor.
+ *
+ * @param {Object} [options] parser configuration. It must contain at least one
+ *   of `domains`, `listUrlRegexes`, `contentUrlRegexes` or `fields`.
+ * @throws {Error} when none of those keys is present.
+ */
+function Crawler (options) {
+    var settings = options || {};
+    var recognisedKeys = ['domains', 'listUrlRegexes', 'contentUrlRegexes', 'fields'];
+    var hasRecognisedKey = recognisedKeys.some(function (key) {
+        return key in settings;
+    });
+    if (!hasRecognisedKey) {
+        throw new Error('options is invalid data format.');
+    }
+    // Copy the configuration onto the instance.
+    this._init(settings);
+}
+/**
+ * Initialisation: copies the user configuration onto the instance, filling in
+ * a default for every option that is missing or falsy.
+ *
+ * The caller's `options` object is not modified. `domains` is only passed to
+ * `helper.formatUrl` when it was supplied, so a configuration that omits it
+ * (which the constructor's validation permits) no longer throws here.
+ *
+ * @param {Object} options configuration object handed over by the constructor.
+ */
+Crawler.prototype._init = function (options) {
+    // Core
+    this.domains = options.domains ? helper.formatUrl(options.domains) : '';// domain / home page
+    this.listUrlRegexes = options.listUrlRegexes || [];// regexes matching list-page URLs
+    this.contentUrlRegexes = options.contentUrlRegexes || [];// regexes matching content-page URLs
+    this.fields = options.fields || [];// data to extract from content pages
+    this.contentPage = options.contentPage || null;// "next page" handling for content pages
+    this.sourceId = options.sourceId || [2, 5, 4];// capture-group indexes that make up the unique id (see _getSourceId)
+    // Request configuration
+    this.userAgent = options.userAgent || null;// User-Agent header to send
+    this.charset = options.charset || null;// page encoding
+    this.format = options.format || 'html';// response format: html | json | jsonp
+    this.i18n = options.i18n || null;// Chinese conversion: s2t | t2s | s2tw | tw2s | s2hk | hk2s | t2tw | t2hk
+    // Hooks
+    this.afterExtractField = options.afterExtractField || null;// post-process each extracted field
+    this.afterExtractAll = options.afterExtractAll || null;// post-process the complete result
+    this.afterExtractUrls = options.afterExtractUrls || null;// post-process the extracted URLs
+    this.attachFields = options.attachFields || null;// extra data fetched from an additional URL
+};
+/**
+ * Classifies a URL against the configured regexes.
+ *
+ * @param {string} url URL to classify.
+ * @param {string} [type] optional filter, 'list' or 'post'.
+ * @returns {boolean|string} with `type`: whether the URL is of that type.
+ *   Without `type`: 'post', 'list' or '' (no match); 'post' wins when both
+ *   kinds of regex match.
+ */
+Crawler.prototype._judge = function (url, type) {
+    var matchesAny = function (regexes) {
+        return regexes.some(function (urlRegex) {
+            // A regex carrying the `g` or `y` flag remembers `lastIndex`
+            // between `.test()` calls, so the same URL would alternately
+            // match and not match. Reset it to make every check independent.
+            urlRegex.lastIndex = 0;
+            return urlRegex.test(url);
+        });
+    };
+    var result = '';
+    if ((!type || type === 'list') && matchesAny(this.listUrlRegexes)) {
+        result = 'list';
+    }
+    if ((!type || type === 'post') && matchesAny(this.contentUrlRegexes)) {
+        result = 'post';
+    }
+    return type ? result === type : result;
+};
+/**
+ * Builds a unique identifier from a URL.
+ *
+ * The URL is split by a regex into capture groups: 1 scheme, 2 host, 3 port,
+ * 4 path up to and including the last '/', 5 last path segment, 6 fragment,
+ * 7 query. The groups listed in `this.sourceId` are concatenated in that
+ * order. The host (group 2) has its dot-separated labels reversed; in every
+ * other group each '/' and the first '.' become '-'.
+ *
+ * Side effect: stores the host in `this._site`.
+ *
+ * @param {string} url URL to derive the identifier from.
+ * @returns {string|null} the identifier, or null (after logging) when the URL
+ *   is neither a list nor a content URL or cannot be split by the regex.
+ */
+Crawler.prototype._getSourceId = function (url) {
+    var type = this._judge(url);
+    if (!type) {
+        console.error('The url type is not list or post.');
+        return null;
+    }
+    var regex = /(\w+):\/\/([^\:|\/]+)(\:\d*)?(.*\/)([^#|\?|\n]+)?(#.*)?(\?.*)?/i;
+    var arr = url.match(regex);
+    if (!arr) {
+        // e.g. a URL without "scheme://" that still satisfies an unanchored
+        // list/content regex; indexing into null would throw a TypeError.
+        console.error('The url can not be parsed: ' + url);
+        return null;
+    }
+    this._site = arr[2];
+    var sources = '';
+    this.sourceId.forEach(function (item) {
+        if (!arr[item]) {
+            return;
+        }
+        if (item === 2) {
+            sources += arr[item].split('.').reverse().join('.');
+        } else {
+            // NOTE(review): only the first '.' is replaced. Kept as is so
+            // identifiers that were already generated stay stable.
+            sources += arr[item].replace(/\//img, '-').replace('.', '-');
+        }
+    });
+    // Drop a single trailing '-' (left over from a trailing '/'), then trim.
+    sources = ((sources.substring(sources.length - 1) === '-') ? sources.substring(0, sources.length - 1) : sources).trim();
+    return sources;
+};
+/**
+ * Fetches a page and, when the URL is a content ("post") URL, extracts its
+ * fields.
+ *
+ * Runs three steps in sequence: request the page body, optionally replace the
+ * body through `parser.getContentPage` ("next page" handling), optionally
+ * fetch the attached URL and extract the attach fields. Afterwards the fields
+ * are parsed from the body, `from`, `sourceId` and `site` are added, the
+ * attach fields are merged in, and `afterExtractAll` (if set) replaces the
+ * whole result with its return value.
+ *
+ * Side effect: stores `url` in `this.url`.
+ *
+ * @param {string} url page URL.
+ * @param {Function} callback called as `callback(error)` on failure, otherwise
+ *   as `callback(error, result)` with `result = { bodyData, fields }`;
+ *   `fields` stays null when the URL is not a content URL or no body came back.
+ */
+Crawler.prototype._getContent = function (url, callback) {
+    var self = this;
+    self.url = url;
+    var result = {};
+    var resultAttachFields = {};
+    result.bodyData = null;
+    result.fields = null;
+    // Step 1: request the page body.
+    var getBodyFields = function (done) {
+        helper.request(url, {
+            format: self.format,
+            charset: self.charset,
+            userAgent: self.userAgent
+        }, function (error, body) {
+            if (error) {
+                return done(error);
+            }
+            result.bodyData = body;
+            done(error);
+        });
+    };
+    // Step 2: "next page" handling; only for content URLs that returned a
+    // body and only when `contentPage` is configured.
+    var getBodyPage = function (done) {
+        if (!(self._judge(url, 'post') && !!result.bodyData && !!self.contentPage)) {
+            return done(null);
+        }
+        parser.getContentPage(self, { body: result.bodyData, url }, function (error, body) {
+            if (error) {
+                return done(error);
+            }
+            if (body) {
+                result.bodyData = body;
+            }
+            done(error);
+        });
+    };
+    // Step 3: attach data; only for content URLs with `attachFields.url` set.
+    var getAttachBodyFields = function (done) {
+        if (!self._judge(url, 'post')) {
+            return done(null);
+        }
+        if (!self.attachFields) {
+            return done(null);
+        }
+        if (!self.attachFields.url) {
+            return done(null);
+        }
+        parser.getAttachUrl({
+            url: self.attachFields.url,
+            meta: self.attachFields.meta,
+            body: result.bodyData
+        }, function (error, _url) {
+            // Previously this error was ignored and the request below was
+            // issued anyway; propagate it instead.
+            if (error) {
+                return done(error);
+            }
+            helper.request(_url, {
+                format: self.format,
+                charset: self.charset,
+                userAgent: self.userAgent
+            }, function (error, body) {
+                if (error) {
+                    return done(error);
+                }
+                resultAttachFields = parser.getFieldsBySelector(body, self.attachFields.fields);
+                done(error);
+            });
+        });
+    };
+    async.waterfall([getBodyFields, getBodyPage, getAttachBodyFields], function (error) {
+        if (error) {
+            return callback(error);
+        }
+
+        if (self._judge(url, 'post') && !!result.bodyData) {
+            // Extract the configured fields.
+            result.fields = parser.getFields(result.bodyData, self);
+            result.fields.from = url;
+            result.fields.sourceId = self._getSourceId(url);
+            result.fields.site = self._site;
+            // Merge in the attach data.
+            for (var name in resultAttachFields) {
+                result.fields[name] = resultAttachFields[name];
+            }
+            // Let the user hook post-process the complete result.
+            if (self.afterExtractAll) {
+                result = self.afterExtractAll(result);
+            }
+        }
+        callback(error, result);
+    });
+};
+/**
+ * Fetches a list page and extracts the URLs found in it.
+ *
+ * @param {string} url page URL.
+ * @param {Function} callback called as `callback(error, { urls })`; `urls` is
+ *   null when the request produced no body.
+ */
+Crawler.prototype._getLinks = function (url, callback) {
+    var crawler = this;
+    var requestOptions = {
+        format: crawler.format,
+        charset: crawler.charset,
+        userAgent: crawler.userAgent
+    };
+    helper.request(url, requestOptions, function (error, body) {
+        var links = { urls: null };
+        if (body) {
+            links.urls = crawler._parseUrls(body, url);
+        }
+        callback(error, links);
+    });
+};
+/**
+ * Extracts URLs from a page body via `parser.getUrls`.
+ * Side effect: stores `url` in `this.url` before delegating.
+ */
+Crawler.prototype._parseUrls = function (bodyData, url) {
+    this.url = url;
+    return parser.getUrls(bodyData, this);
+};
+/**
+ * Parses a page: the combination of `getContent` and `getLinks`.
+ *
+ * @param {string} url page URL; it must match a list or content regex.
+ * @param {Function} [callback] optional Node-style callback `(error, result)`.
+ * @returns {Promise<Object>} resolves with `{ fields, urls }`. On failure the
+ *   error is logged; it is then handed to `callback` when one was given (the
+ *   promise is left pending in that case), otherwise the promise is rejected.
+ *   A URL matching no regex fails with the string 'url mismatch'.
+ */
+Crawler.prototype.parse = function (url, callback) {
+    url = helper.formatUrl(url);
+    var self = this;
+    var type = null;
+    var result = {};
+    var bodyData = null;
+
+    // Step 1: make sure the URL is a known list/content URL.
+    var judge = function (done) {
+        type = self._judge(url);
+        if (type) {
+            done(null, type);
+        } else {
+            done('url mismatch');
+        }
+    };
+    // Step 2: fetch the page and extract its data fields.
+    var parserFields = function (type, done) {
+        self._getContent(url, function (error, data) {
+            if (error) {
+                return done(error);
+            }
+            // `data` can be missing without an error (e.g. `afterExtractAll`
+            // returned nothing). Always pass `result` on, so the next step
+            // receives the arguments it expects instead of getting the
+            // callback in the `data` position.
+            bodyData = data ? data.bodyData : null;
+            result.fields = data ? data.fields : null;
+            done(null, result);
+        });
+    };
+    // Step 3: extract the links from the body fetched in step 2.
+    var parserUrls = function (data, done) {
+        result.urls = null;
+        if (!!bodyData) {
+            result.urls = self._parseUrls(bodyData, url);
+        }
+        done(null, result);
+    };
+    return new Promise(function (resolve, reject) {
+        async.waterfall([judge, parserFields, parserUrls], function (error, parsed) {
+            if (error) {
+                console.error(error);
+                if (callback) return callback(error);
+                return reject(error);
+            }
+            resolve(parsed);
+            if (callback) {
+                callback(null, parsed);
+            }
+        });
+    });
+};
+/**
+ * Collects the links to crawl from a page.
+ *
+ * @param {string} url page URL.
+ * @param {Function} [callback] optional Node-style callback `(error, urls)`.
+ * @returns {Promise|null} null (synchronously, not a promise) when the URL
+ *   matches neither a list nor a content regex; otherwise a promise resolving
+ *   with the URLs. On failure the error is logged and handed to `callback`
+ *   when one was given, otherwise the promise is rejected.
+ */
+Crawler.prototype.getLinks = function (url, callback) {
+    var crawler = this;
+    var target = helper.formatUrl(url);
+    if (!crawler._judge(target)) {
+        return null;
+    }
+    return new Promise(function (resolve, reject) {
+        crawler._getLinks(target, function (error, result) {
+            if (!error) {
+                resolve(result.urls);
+                if (callback) {
+                    callback(null, result.urls);
+                }
+                return;
+            }
+            console.error(error);
+            if (callback) {
+                return callback(error);
+            }
+            return reject(error);
+        });
+    });
+};
+/**
+ * Fetches the content of a detail (article) page.
+ *
+ * @param {string} url page URL.
+ * @param {Function} [callback] optional Node-style callback `(error, fields)`.
+ * @returns {Promise|null} null (synchronously, not a promise) when the URL is
+ *   not a content URL; otherwise a promise resolving with the extracted
+ *   fields. On failure the error is logged and handed to `callback` when one
+ *   was given, otherwise the promise is rejected.
+ */
+Crawler.prototype.getContent = function (url, callback) {
+    var crawler = this;
+    var target = helper.formatUrl(url);
+    if (!crawler._judge(target) || !crawler.isArticleUrl(target)) {
+        return null;
+    }
+    return new Promise(function (resolve, reject) {
+        crawler._getContent(target, function (error, result) {
+            if (!error) {
+                resolve(result.fields);
+                if (callback) {
+                    callback(null, result.fields);
+                }
+                return;
+            }
+            console.error(error);
+            if (callback) {
+                return callback(error);
+            }
+            return reject(error);
+        });
+    });
+};
+/**
+ * Tells whether a URL is a detail (article) page, i.e. matches one of the
+ * content URL regexes.
+ */
+Crawler.prototype.isArticleUrl = function (url) {
+    var normalized = helper.formatUrl(url);
+    return this._judge(normalized, 'post');
+};
+/**
+ * Tells whether a URL is a list page, i.e. matches one of the list URL
+ * regexes.
+ */
+Crawler.prototype.isListUrl = function (url) {
+    var normalized = helper.formatUrl(url);
+    return this._judge(normalized, 'list');
+};
+/**
+ * Returns the unique identifier derived from a page URL, or null when the URL
+ * matches neither a list nor a content regex.
+ */
+Crawler.prototype.getIdFromArticleUrl = function (url) {
+    var normalized = helper.formatUrl(url);
+    if (!this._judge(normalized)) {
+        return null;
+    }
+    return this._getSourceId(normalized);
+};
+
+module.exports = Crawler;
--- a/lib/helper.js
+++ b/lib/helper.js
+'use strict'
+var request = require('request'),
+    iconv = require('iconv-lite'),
+    OpenCC = require('opencc');
+
+/**
+ * Proxy support: routes requests through the proxy named by the HTTP_PROXY,
+ * HTTPS_PROXY or ALL_PROXY environment variable (first one set wins).
+ *
+ * This runs before every request. `request.defaults` returns a new wrapper
+ * around the current `request`, so re-applying it on each call would stack one
+ * more wrapper per request. The wrapper is therefore only created when the
+ * proxy value differs from the one already applied.
+ */
+var _proxy = (function () {
+    var appliedProxy = null;
+    return function () {
+        var proxy = process.env.HTTP_PROXY || process.env.HTTPS_PROXY || process.env.ALL_PROXY;
+        if (proxy && proxy !== appliedProxy) {
+            request = request.defaults({'proxy': proxy});
+            appliedProxy = proxy;
+        }
+    };
+})();
+/**
+ * Request core: performs a GET and hands the (decoded) body to the callback.
+ *
+ * @param {string} url URL to fetch.
+ * @param {Object} options `charset` (a value other than 'utf-8' makes the body
+ *   arrive as a Buffer and be decoded with iconv) and `userAgent` (sent as
+ *   the User-Agent header).
+ * @param {Function} callback called as `callback(null, body)` on a 200
+ *   response, otherwise as `callback(error)`. A non-200 status is reported
+ *   as an Error; previously it produced `callback(null)` with no body, which
+ *   callers could not tell apart from success.
+ */
+var _requestCore = function (url, options, callback) {
+    _proxy();
+    var needsDecoding = options.charset && options.charset !== 'utf-8';
+    var query = {};
+    query.url = url;
+    query.headers = {};
+    if (needsDecoding) {
+        // encoding: null makes `request` return the raw Buffer for iconv.
+        query.encoding = null;
+    }
+    if (options.userAgent) {
+        query.headers = {
+            'User-Agent': options.userAgent
+        };
+    }
+    request.get(query, function (err, res, body) {
+        if (err) {
+            console.error(err);
+            return callback(err);
+        }
+        if (res.statusCode !== 200) {
+            var statusError = new Error('Request to ' + url + ' failed with status code ' + res.statusCode);
+            console.error(statusError.message);
+            return callback(statusError);
+        }
+        if (needsDecoding) {
+            body = iconv.decode(body, options.charset);// convert to a JS string
+        }
+        callback(null, body);
+    });
+};
+/**
+ * Request handlers, one per response format.
+ *
+ * Each has the signature `(url, options, callback)` and calls
+ * `callback(error, body)`. `json` and `jsonp` deliver the parsed value. A
+ * request error is forwarded without touching the body, and a body that
+ * cannot be parsed is reported through `callback(error)` instead of throwing
+ * from inside the response handler.
+ */
+var _request = (function () {
+    // Parses `text` and reports the outcome Node-style. The callback is
+    // invoked outside the try block so its own exceptions are not swallowed.
+    var deliverJson = function (text, callback) {
+        var parsed;
+        try {
+            parsed = JSON.parse(text);
+        } catch (parseError) {
+            return callback(parseError);
+        }
+        callback(null, parsed);
+    };
+    return {
+        html: function (url, options, callback) {
+            _requestCore(url, options, function (error, body) {
+                callback(error, body);
+            });
+        },
+        json: function (url, options, callback) {
+            _requestCore(url, options, function (error, body) {
+                if (error) {
+                    return callback(error);
+                }
+                deliverJson(body, callback);
+            });
+        },
+        jsonp: function (url, options, callback) {
+            _requestCore(url, options, function (error, body) {
+                if (error) {
+                    return callback(error);
+                }
+                // Take the payload between the first '(' and the last ')'.
+                // For the previously supported shape `callback(...)` this is
+                // the same as the old fixed substring(9, length - 1), and it
+                // also accepts other callback names and a trailing ';'.
+                var open = typeof body === 'string' ? body.indexOf('(') : -1;
+                var close = open === -1 ? -1 : body.lastIndexOf(')');
+                if (close <= open) {
+                    return callback(new Error('The response is not valid jsonp.'));
+                }
+                deliverJson(body.substring(open + 1, close), callback);
+            });
+        }
+    };
+})();
+
+/**
+ * Request entry point; supports the html, json and jsonp formats.
+ *
+ * @param {string} url URL to fetch.
+ * @param {Object} options request options; `format` defaults to 'html'. The
+ *   object is not modified.
+ * @param {Function} callback Node-style callback `(error, body)`. An
+ *   unsupported format is logged and reported through the callback, so the
+ *   caller is no longer left waiting forever.
+ */
+var requestUrl = function (url, options, callback) {
+    var format = options.format || 'html';
+    switch (format) {
+    case 'html':
+        _request.html(url, options, callback);
+        break;
+    case 'json':
+        _request.json(url, options, callback);
+        break;
+    case 'jsonp':
+        _request.jsonp(url, options, callback);
+        break;
+    default:
+        console.error('The request format is error.');
+        if (typeof callback === 'function') {
+            callback(new Error('The request format is error.'));
+        }
+    }
+};
+/**
+ * Chinese script conversion (i18n) through OpenCC.
+ *
+ * Supported conversion types:
+ *   s2t   Simplified -> Traditional
+ *   t2s   Traditional -> Simplified
+ *   s2tw  Simplified -> Traditional (Taiwan standard)
+ *   tw2s  Traditional (Taiwan standard) -> Simplified
+ *   s2hk  Simplified -> Traditional (Hong Kong)
+ *   hk2s  Traditional (Hong Kong) -> Simplified
+ *   t2tw  Traditional -> Traditional (Taiwan standard)
+ *   t2hk  Traditional -> Traditional (Hong Kong)
+ *
+ * @param {string} str text to convert.
+ * @param {string} [type] conversion type; defaults to 'tw2s'.
+ * @returns {string} the converted text, or `str` unchanged (after logging)
+ *   when the type is not supported.
+ */
+var translate = function (str, type) {
+    var supportedTypes = ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 't2tw', 't2hk'];
+    var conversion = type || 'tw2s';
+    if (supportedTypes.indexOf(conversion) === -1) {
+        console.error(conversion, 'in i18n is null');
+        return str;
+    }
+    return new OpenCC(conversion + '.json').convertSync(str);
+};
+
+/**
+ * Appends the trailing '/' to a home-page URL: a URL containing exactly two
+ * slashes (e.g. "http://host") gets one appended; any other URL is returned
+ * unchanged.
+ */
+var formatUrl = function (url) {
+    var slashCount = url.split('/').length - 1;
+    return slashCount === 2 ? url + '/' : url;
+};
+/**
+ * Encoding: replaces every UTF-16 code unit above U+00FF with "&#x" followed
+ * by its four-digit upper-case hex value.
+ * NOTE(review): no ';' terminator is emitted, so the output is not matched by
+ * the "&#xHHHH;" pattern that rencode decodes. Confirm whether that is
+ * intended.
+ */
+var encode = function (str) {
+    return str.replace(/[^\u0000-\u00FF]/g, function (unit) {
+        var hex = unit.charCodeAt(0).toString(16).toUpperCase();
+        return '&#x' + ('0000' + hex).slice(-4);
+    });
+};
+// Decoding: turns "&#xHHHH;" references into "%uHHHH" escapes, decodes the
+// string with unescape() (which also decodes any %XX / %uXXXX sequence that
+// was already present), then replaces leftover "%uA0" and "&#xA0;"
+// (non-breaking space) with a plain space.
+var rencode = function (str) {
+    var percentEscaped = str.replace(/(&#x)(\w{4});/gi, "%u$2");
+    var decoded = unescape(percentEscaped);
+    return decoded.replace(/%uA0/img, ' ').replace(/&#xA0;/img, ' ');
+};
+/**
+ * Array de-duplication: returns a new array holding the first occurrence of
+ * each value, in the original order. The input array is not modified.
+ *
+ * A Set is used instead of a plain object as the "seen" table: with a plain
+ * object, values such as 'constructor', 'toString' or '__proto__' looked up
+ * inherited properties, counted as already seen, and were dropped entirely.
+ *
+ * @param {Array} arr values to de-duplicate.
+ * @returns {Array} a new array without repeated values.
+ */
+var deDuplication = function (arr) {
+    return Array.from(new Set(arr));
+};
+
+// Public helper API. `request` is the format-dispatching requestUrl.
+module.exports = {
+    request: requestUrl,
+    translate,
+    formatUrl,
+    encode,
+    rencode,
+    deDuplication
+};
--- a/lib/parser.js
+++ b/lib/parser.js
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
+{
+  "name": "almighty-parser-core",
+  "version": "1.0.7",
+  "description": "crawler parser core",
+  "main": "index.js",
+  "scripts": {
+    "test:qiushibaike": "node --harmony-async-await ./example/test/qiushibaike.js",
+    "test:healthno1": "node --harmony-async-await ./example/test/healthno1.js",
+    "test:baoliao5": "node --harmony-async-await ./example/test/baoliao5.js"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git@github.com:coolfishstudio/almighty-parser-core.git"
+  },
+  "keywords": "crawler, parser",
+  "author": "Yves",
+  "license": "MIT",
+  "dependencies": {
+    "async": "^2.4.1",
+    "cheerio": "^1.0.0-rc.1",
+    "iconv-lite": "^0.4.17",
+    "opencc": "^1.0.5",
+    "request": "^2.81.0",
+    "xmldom": "^0.1.27",
+    "xpath": "0.0.24"
+  }
+}