From 6f402965e80d56dfe85d82066e9d92f7a41f05ea Mon Sep 17 00:00:00 2001 From: SimonC-Audigent Date: Mon, 23 Sep 2024 12:26:57 +0100 Subject: [PATCH] refactor code to use isExplicitlyDisallowed --- README.md | 12 ++++++++--- Robots.js | 54 +++++++++++++++++--------------------------------- index.d.ts | 3 ++- test/Robots.js | 16 ++------------- 4 files changed, 31 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 47c3f32..6665594 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [ robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true -robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false +robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false robots.getCrawlDelay('Sams-Bot/1.0'); // 1 robots.getSitemaps(); // ['http://example.com/sitemap.xml'] robots.getPreferredHost(); // example.com @@ -55,15 +55,21 @@ Returns true if crawling the specified URL is allowed for the specified user-age This will return `undefined` if the URL isn't valid for this robots.txt. -### isDisallowed(url, [ua], [explicit]) +### isDisallowed(url, [ua]) **boolean or undefined** Returns true if crawling the specified URL is not allowed for the specified user-agent. -In explicit mode, user agents wildcards are discarded. This will return `undefined` if the URL isn't valid for this robots.txt. +### isExplicitlyDisallowed(url, ua) + +**boolean or undefined** + +Returns true if explicitly disallowed for the specified user agent (User Agent wildcards are discarded). + +This will return undefined if the URL is not valid for this robots.txt file. 
### getMatchingLineNumber(url, [ua]) **number or undefined** diff --git a/Robots.js b/Robots.js index db26a93..9fb7cf5 100644 --- a/Robots.js +++ b/Robots.js @@ -397,7 +397,7 @@ Robots.prototype._getRule = function (url, ua, explicit) { * @return {boolean?} */ Robots.prototype.isAllowed = function (url, ua) { - var rule = this._getRule(url, ua); + var rule = this._getRule(url, ua, false); if (typeof rule === 'undefined') { return; @@ -421,54 +421,36 @@ Robots.prototype.isAllowed = function (url, ua) { * @return {number?} */ Robots.prototype.getMatchingLineNumber = function (url, ua) { - var rule = this._getRule(url, ua); + var rule = this._getRule(url, ua, false); return rule ? rule.lineNumber : -1; }; /** - * In standard mode, it returns the opposite of is allowed(). - * In explicit mode, it will return: - * - true if the the agent is explicitly disallowed (wildcard non included), - * - throws an error if the user agent is not specified, - * - and false otherwise. + * Returns the opposite of isAllowed() + * * @param {string} url - * @param {string} ua + * @param {string?} ua * @return {boolean} */ -Robots.prototype.isDisallowed = function (url, ua, explicit) { - if ((explicit === true) && (ua === undefined)) { - throw new Error("User Agent must be specified in explicit mode") - } - - var rule = this._getRule(url, ua, explicit); - if (typeof rule === 'undefined') { - return true; - } - return !(!rule || rule.allow); +Robots.prototype.isDisallowed = function (url, ua) { + return !this.isAllowed(url, ua); }; +/** + * Returns true if explicitly disallowed + * for the specified user agent (User Agent wildcards are discarded). + * + * This will return undefined if the URL is not valid for this robots.txt file. 
+ * @param {string} url + * @param {string} ua + * @return {boolean?} + */ Robots.prototype.isExplicitlyDisallowed = function(url, ua) { - var parsedUrl = parseUrl(url) || {}; - var userAgent = formatUserAgent(ua); - - // The base URL must match otherwise this robots.txt is not valid for it. - if ( - parsedUrl.protocol !== this._url.protocol || - parsedUrl.hostname !== this._url.hostname || - parsedUrl.port !== this._url.port - ) { - return; - } - - var rules = this._rules[userAgent] || []; - var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search); - var rule = findRule(path, rules); - + var rule = this._getRule(url, ua, true); if (typeof rule === 'undefined') { - return; + return; } - return !(!rule || rule.allow); } diff --git a/index.d.ts b/index.d.ts index 852ddec..0cf4313 100644 --- a/index.d.ts +++ b/index.d.ts @@ -2,7 +2,8 @@ declare module 'robots-parser'; interface Robot { isAllowed(url: string, ua?: string): boolean | undefined; - isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined; + isDisallowed(url: string, ua?: string): boolean | undefined; + isExplicitlyDisallowed(url: string, ua: string): boolean | undefined; getMatchingLineNumber(url: string, ua?: string): number; getCrawlDelay(ua?: string): number | undefined; getSitemaps(): string[]; diff --git a/test/Robots.js b/test/Robots.js index 6f979ba..f1575ae 100644 --- a/test/Robots.js +++ b/test/Robots.js @@ -872,7 +872,7 @@ describe('Robots', function () { var userAgent = 'SomeBot'; var robots = robotsParser(url, contents); - expect(robots.isDisallowed(url, userAgent, true)).to.equal(false) + expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false) }) it('should be disallowed when user agent equal robots rule in explicit mode', function () { var contents = [ 'User-agent: SomeBot', 'Disallow: /', ].join('\n') var url = 'https://www.example.com/hello' var userAgent = 'SomeBot'; var robots = robotsParser(url, contents); - expect(robots.isDisallowed(url, userAgent, true)).to.equal(true) + 
expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true) }) - - it('should throw an error when user agent is not set in explicit mode', function () { - var contents = [ - 'User-agent: SomeBot', - 'Disallow: /', - ].join('\n') - - var url = 'https://www.example.com/hello' - var robots = robotsParser(url, contents); - - expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw("User Agent must be specified in explicit mode") - }) });