From 4a720f8b599190ce0a59745c00a73d72aa3b309e Mon Sep 17 00:00:00 2001
From: "C. Bess" <cbess@quantumquinn.com>
Date: Sat, 30 Nov 2019 17:45:18 -0600
Subject: [PATCH 1/3] support max content length restriction

- refactor _downloadUrl to drop the Promise.promisify wrapper so that request's event handlers are exposed
- add maxContentLength option
---
 lib/Crawler.js | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/lib/Crawler.js b/lib/Crawler.js
index ef8efd1..0bb2f31 100644
--- a/lib/Crawler.js
+++ b/lib/Crawler.js
@@ -6,7 +6,7 @@ var Crawler,
     Promise = require("bluebird"),
     urlMod = require("url"),
     NodeCache = require("node-cache"),
-    request = Promise.promisify(require("request")),
+    request = require("request"),
     robotsParser = require("robots-parser"),
     mime = require('mime-types'),
     _ = require("lodash"),
@@ -44,6 +44,7 @@ Crawler = function (opts) {
   this._outstandingRequests = 0;
   this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false;
   this._robotsEnabled = (opts.robotsEnabled !== false);
+  this._maxContentLength = opts.maxContentLength || 0;
 };
 
 util.inherits(Crawler, EventEmitter);
@@ -379,7 +380,9 @@ Crawler.prototype._fireHandlers = function (contentType, body, url) {
  */
 Crawler.prototype._downloadUrl = function (url, followRedirect) {
   var defaultOptions,
-      requestOptions;
+      requestOptions,
+      totalBytes = 0,
+      self = this;
 
   defaultOptions = {
     url: url,
@@ -393,21 +396,32 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) {
   };
   requestOptions = _.merge(defaultOptions, this.getRequestOptions());
 
-  return request(requestOptions).catch(function (err) {
-    err = new error.RequestError("A request error occured. " + err.message);
-
-    return Promise.reject(err);
-  }).then(function (response) {
-    var err;
+  return new Promise(function (resolve, reject) {
+    request(requestOptions, function (_, response) {
+      if (response.statusCode >= 400) {
+        var err = new error.HttpError("HTTP status code is " + response.statusCode);
+        err.statusCode = response.statusCode;
 
-    if (response.statusCode >= 400) {
-      err = new error.HttpError("HTTP status code is " + response.statusCode);
-      err.statusCode = response.statusCode;
+        reject(err);
+        return;
+      }
 
-      return Promise.reject(err);
-    }
+      resolve(response);
+    }).on('error', function (err) {
+      err = new error.RequestError("A request error occured. " + err.message);
+      reject(err);
+    }).on('data', function (data) {
+      if (self._maxContentLength <= 0) {
+        return;
+      }
 
-    return response;
+      // count bytes
+      totalBytes += data.length;
+      if (totalBytes > self._maxContentLength) {
+        this.abort();
+        reject(new error.RequestError('Max content length exceeded.'));
+      }
+    });
   });
 };
 

From 94a138e2a21d8b76ffcc05d8801153f7d0d5b0b7 Mon Sep 17 00:00:00 2001
From: "C. Bess" <cbess@quantumquinn.com>
Date: Sat, 30 Nov 2019 18:47:02 -0600
Subject: [PATCH 2/3] refactor to use max content length function - update
 readme

---
 README.md      |  2 ++
 lib/Crawler.js | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 426b842..dc3ed18 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,7 @@ crawler with the following options:
 | robotsIgnoreServerError | Indicates if `500` status code response for robots.txt should be ignored. Defaults to `false`. |
 | userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. |
 | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
+| maxContentLength | The maximum response content length (in bytes) to download; a request is aborted once this limit is exceeded. This can be either a number or a function that takes the URL being downloaded. Defaults to `0` (no max). |
 
 Example usage:
 
@@ -154,6 +155,7 @@ The following methods are available:
 | getInterval | Get the interval setting. |
 | getConcurrentRequestsLimit | Get the maximum number of concurrent requests. |
 | getUserAgent | Get the user agent. |
+| getMaxContentLength | Get the maximum content length for the request. |
 | start | Start crawling. |
 | stop | Stop crawling. |
 | addHandler(handler) | Add a handler for all content types. |
diff --git a/lib/Crawler.js b/lib/Crawler.js
index 0bb2f31..93d459c 100644
--- a/lib/Crawler.js
+++ b/lib/Crawler.js
@@ -100,6 +100,19 @@ Crawler.prototype.getRequestOptions = function () {
   return this._request;
 };
 
+/**
+ * Get the maximum content length for a request to the given URL.
+ * @param url URL being downloaded; passed to the option when it is a function.
+ * @return {number} Max content length in bytes; `0` (the default) means no max.
+ */
+Crawler.prototype.getMaxContentLength = function (url) {
+  if (typeof this._maxContentLength === 'function') {
+    return this._maxContentLength(url);
+  }
+
+  return this._maxContentLength;
+};
+
 /**
  * Start the crawler. Pages will be crawled according to the configuration
  * provided to the Crawler's constructor.
@@ -411,13 +424,13 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) {
       err = new error.RequestError("A request error occured. " + err.message);
       reject(err);
     }).on('data', function (data) {
-      if (self._maxContentLength <= 0) {
+      if (self.getMaxContentLength(url) <= 0) {
         return;
       }
 
       // count bytes
       totalBytes += data.length;
-      if (totalBytes > self._maxContentLength) {
+      if (totalBytes > self.getMaxContentLength(url)) {
         this.abort();
         reject(new error.RequestError('Max content length exceeded.'));
       }

From 59c1b73819ad15a23ecf32a97e1e647451f59240 Mon Sep 17 00:00:00 2001
From: "C. Bess" <cbess@quantumquinn.com>
Date: Wed, 4 Dec 2019 20:42:43 -0600
Subject: [PATCH 3/3] handle response error

---
 lib/Crawler.js | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/Crawler.js b/lib/Crawler.js
index 93d459c..d3ce11a 100644
--- a/lib/Crawler.js
+++ b/lib/Crawler.js
@@ -410,16 +410,18 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) {
   requestOptions = _.merge(defaultOptions, this.getRequestOptions());
 
   return new Promise(function (resolve, reject) {
-    request(requestOptions, function (_, response) {
-      if (response.statusCode >= 400) {
-        var err = new error.HttpError("HTTP status code is " + response.statusCode);
+    request(requestOptions, function (err, response) {
+      if (!response) {
+        err = new error.RequestError("A request error occured. " + err.message);
+        reject(err);
+      } else if (response.statusCode >= 400) {
+        err = new error.HttpError("HTTP status code is " + response.statusCode);
         err.statusCode = response.statusCode;
 
         reject(err);
-        return;
+      } else {
+        resolve(response);
       }
-
-      resolve(response);
     }).on('error', function (err) {
       err = new error.RequestError("A request error occured. " + err.message);
       reject(err);