From 4a720f8b599190ce0a59745c00a73d72aa3b309e Mon Sep 17 00:00:00 2001 From: "C. Bess" Date: Sat, 30 Nov 2019 17:45:18 -0600 Subject: [PATCH 1/3] support max content length restriction - refactor _downloadUrl to remove Promisify wrapper to expose event handlers - add maxContentLength option --- lib/Crawler.js | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/lib/Crawler.js b/lib/Crawler.js index ef8efd1..0bb2f31 100644 --- a/lib/Crawler.js +++ b/lib/Crawler.js @@ -6,7 +6,7 @@ var Crawler, Promise = require("bluebird"), urlMod = require("url"), NodeCache = require("node-cache"), - request = Promise.promisify(require("request")), + request = require("request"), robotsParser = require("robots-parser"), mime = require('mime-types'), _ = require("lodash"), @@ -44,6 +44,7 @@ Crawler = function (opts) { this._outstandingRequests = 0; this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false; this._robotsEnabled = (opts.robotsEnabled !== false); + this._maxContentLength = opts.maxContentLength || 0; }; util.inherits(Crawler, EventEmitter); @@ -379,7 +380,9 @@ Crawler.prototype._fireHandlers = function (contentType, body, url) { */ Crawler.prototype._downloadUrl = function (url, followRedirect) { var defaultOptions, - requestOptions; + requestOptions, + totalBytes = 0, + self = this; defaultOptions = { url: url, @@ -393,21 +396,32 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) { }; requestOptions = _.merge(defaultOptions, this.getRequestOptions()); - return request(requestOptions).catch(function (err) { - err = new error.RequestError("A request error occured. " + err.message); - - return Promise.reject(err); - }).then(function (response) { - var err; + return new Promise(function (resolve, reject) { + request(requestOptions, function (_, response) { + if (response.statusCode >= 400) { + var err = new error.HttpError("HTTP status code is " + response.statusCode); + err.statusCode = response.statusCode; - if (response.statusCode >= 400) { - err = new error.HttpError("HTTP status code is " + response.statusCode); - err.statusCode = response.statusCode; + reject(err); + return; + } - return Promise.reject(err); - } + resolve(response); + }).on('error', function (err) { + err = new error.RequestError("A request error occured. " + err.message); + reject(err); + }).on('data', function (data) { + if (self._maxContentLength <= 0) { + return; + } - return response; + // count bytes + totalBytes += data.length; + if (totalBytes > self._maxContentLength) { + this.abort(); + reject(new error.RequestError('Max content length exceeded.')); + } + }); }); }; From 94a138e2a21d8b76ffcc05d8801153f7d0d5b0b7 Mon Sep 17 00:00:00 2001 From: "C. Bess" Date: Sat, 30 Nov 2019 18:47:02 -0600 Subject: [PATCH 2/3] refactor to use max content length function - update readme --- README.md | 2 ++ lib/Crawler.js | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 426b842..dc3ed18 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ crawler with the following options: | robotsIgnoreServerError | Indicates if `500` status code response for robots.txt should be ignored. Defaults to `false`. | | userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. | | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. | +| maxContentLength | The maximum content length (bytes) for requests. This can be either a number or a function that takes the URL being downloaded. Defaults to `0` (no max). | Example usage: @@ -154,6 +155,7 @@ The following methods are available: | getInterval | Get the interval setting. | | getConcurrentRequestsLimit | Get the maximum number of concurrent requests. | | getUserAgent | Get the user agent. | +| getMaxContentLength | Get the maximum content length for the request. | | start | Start crawling. | | stop | Stop crawling. | | addHandler(handler) | Add a handler for all content types. | diff --git a/lib/Crawler.js b/lib/Crawler.js index 0bb2f31..93d459c 100644 --- a/lib/Crawler.js +++ b/lib/Crawler.js @@ -100,6 +100,19 @@ Crawler.prototype.getRequestOptions = function () { return this._request; }; +/** + * Get the maximum content length for the request. + * + * @return {number} Max content length in bytes. + */ +Crawler.prototype.getMaxContentLength = function (url) { + if (typeof this._maxContentLength === 'function') { + return this._maxContentLength(url); + } + + return this._maxContentLength; +}; + /** * Start the crawler. Pages will be crawled according to the configuration * provided to the Crawler's constructor. @@ -411,13 +424,13 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) { err = new error.RequestError("A request error occured. " + err.message); reject(err); }).on('data', function (data) { - if (self._maxContentLength <= 0) { + if (self.getMaxContentLength(url) <= 0) { return; } // count bytes totalBytes += data.length; - if (totalBytes > self._maxContentLength) { + if (totalBytes > self.getMaxContentLength(url)) { this.abort(); reject(new error.RequestError('Max content length exceeded.')); } From 59c1b73819ad15a23ecf32a97e1e647451f59240 Mon Sep 17 00:00:00 2001 From: "C. Bess" Date: Wed, 4 Dec 2019 20:42:43 -0600 Subject: [PATCH 3/3] handle response error --- lib/Crawler.js | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/Crawler.js b/lib/Crawler.js index 93d459c..d3ce11a 100644 --- a/lib/Crawler.js +++ b/lib/Crawler.js @@ -410,16 +410,18 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) { requestOptions = _.merge(defaultOptions, this.getRequestOptions()); return new Promise(function (resolve, reject) { - request(requestOptions, function (_, response) { - if (response.statusCode >= 400) { - var err = new error.HttpError("HTTP status code is " + response.statusCode); + request(requestOptions, function (err, response) { + if (!response) { + err = new error.RequestError("A request error occured. " + err.message); + reject(err); + } else if (response.statusCode >= 400) { + err = new error.HttpError("HTTP status code is " + response.statusCode); err.statusCode = response.statusCode; reject(err); - return; + } else { + resolve(response); } - - resolve(response); }).on('error', function (err) { err = new error.RequestError("A request error occured. " + err.message); reject(err);