RobotsTxt

Fetch and parse the robots.txt

Introduction

RobotsTxt is a DEX8 helper which fetches the robots.txt file with an HTTP request and converts it into a JavaScript object.
robots.txt

Supported Directives

The RobotsTxt parser supports the following robots.txt directives:
  • User-agent:
  • Allow:
  • Disallow:
  • Sitemap:
  • Crawl-delay:

Properties

Property Description Type Default Example
base_url Base URL consisting of the protocol and host string https://www.adsuu.com:80
robotsTxt_url Location of the robots.txt file string https://www.adsuu.com:80/robots.txt
userAgent User agent used to fetch the robots.txt string Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36 Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
botName crawler, spider name string Googlebot, baiduspider, bingbot, slurp, yandex
robotsTxtObj JS object which represents the converted robots.txt file object {'*': {allow: [], disallow: []}}

Class "RobotsTxt"

new RobotsTxt(baseURL, userAgent, botName)

const { RobotsTxt } = require('dex8-sdk');

// Site whose robots.txt is inspected, the User-Agent header used for the request,
// and the crawler name passed to the RobotsTxt constructor.
const baseURL = 'https://www.adsuu.com';
const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36';
const botName = 'MediaPartners-Google';

const robotsTxt = new RobotsTxt(baseURL, userAgent, botName);

/**
 * Parses robots.txt and prints the URLs to follow and not to follow.
 * @returns {Promise<void>}
 */
const fja = async () => {
  // parser() must complete before the follow/unfollow lists are requested.
  await robotsTxt.parser();
  const follow_urls = robotsTxt.whatToFollow();
  const unfollow_urls = robotsTxt.whatToUnfollow();
  console.log('\nfollow_urls:: ', follow_urls);
  console.log('\nunfollow_urls:: ', unfollow_urls);
};

// Handle rejections explicitly so a failure is reported instead of
// surfacing as an unhandled promise rejection.
fja().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

The RobotsTxt class is injected by the DEX8 system (as lib.RobotsTxt), so usage is even simpler.
----------------- respect_robotsTxt.js ------------------------

module.exports = async (x, lib) => {
  const echo = lib.echo;
  const RobotsTxt = lib.RobotsTxt;

  const baseURL = 'https://www.adsuu.com';
  const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36';
  const botName = 'MediaPartners-Google';

  const robotsTxt = new RobotsTxt(baseURL, userAgent, botName);

  await robotsTxt.parser();
  const follow_urls =  robotsTxt.whatToFollow();
  const unfollow_urls =  robotsTxt.whatToUnfollow();

  echo.log('follow_urls:: ', follow_urls);
  echo.log('unfollow_urls:: ', unfollow_urls);


  return x;
};

Methods

async fetch()

Fetches the robots.txt file and reads its content.
const robotsTxtFile = await robotsTxt.fetch();

async parser()

Takes the fetched robots.txt text and converts it to a JavaScript object.
const robotsTxtObj = await robotsTxt.parser();
https://www.google.com/robots.txt
--------------------------------------
robotsTxtObj::
{
  '*': {
    allow: [
      '/search/about',
      '/search/static',
      '/search/howsearchworks',
      '/?hl=',
      '/?hl=.*&gws_rd=ssl$',
      '/?gws_rd=ssl$',
      '/?pt1=true$',
      '/m/finance',
      '/books/about',
      '/booksrightsholders',
      '/books?.*zoom=1.*',
      '/books?.*zoom=5.*',
      '/books/content?.*zoom=1.*',
      '/books/content?.*zoom=5.*',
      '/ebooks?.*zoom=1.*',
      '/ebooks?.*zoom=5.*',
      '/citations?user=',
      '/citations?view_op=new_profile',
      '/citations?view_op=top_venues',
      '/scholar_share',
      '/maps?.*output=classic.*',
      '/maps?.*file=',
      '/maps/d/',
      '/maps/api/js',
      '/calendar$',
      '/calendar/about/',
      '/safebrowsing/diagnostic',
      '/safebrowsing/report_badware/',
      '/safebrowsing/report_error/',
      '/safebrowsing/report_phish/',
      '/profiles',
      '/s2/profiles',
      '/s2/oz',
      '/s2/photos',
      '/s2/search/social',
      '/s2/static',
      '/accounts/o8/id',
      '/alerts/manage',
      '/alerts/remove',
      '/alerts/$',
      '/searchhistory/',
      '/maps/reserve',
      '/maps/reserve/partners',
      '/finance',
      '/js/'
    ],
    disallow: [
      '/search',
      '/sdch',
      '/groups',
      '/index.html?',
      '/?',
      '/?hl=*&',
      '/?hl=*&*&gws_rd=ssl',
      '/imgres',
      '/u/',
      '/preferences',
      '/setprefs',
      '/default',
      '/m?',
      '/m/',
      '/wml?',
      '/wml/?',
      '/wml/search?',
      '/xhtml?',
      '/xhtml/?',
      '/xhtml/search?',
      '/xml?',
      '/imode?',
      '/imode/?',
      '/imode/search?',
      '/jsky?',
      '/jsky/?',
      '/jsky/search?',
      '/pda?',
      '/pda/?',
      '/pda/search?',
      '/sprint_xhtml',
      '/sprint_wml',
      '/pqa',
      '/palm',
      '/gwt/',
      '/purchases',
      '/local?',
      '/local_url',
      '/shihui?',
      '/shihui/',
      '/products?',
      '/product_',
      '/products_',
      '/products;',
      '/print',
      '/books/',
      '/bkshp?*q=*',
      '/books?*q=*',
      '/books?*output=*',
      '/books?*pg=*',
      '/books?*jtp=*',
      '/books?*jscmd=*',
      '/books?*buy=*',
      '/books?*zoom=*',
      '/ebooks/',
      '/ebooks?*q=*',
      '/ebooks?*output=*',
      '/ebooks?*pg=*',
      '/ebooks?*jscmd=*',
      '/ebooks?*buy=*',
      '/ebooks?*zoom=*',
      '/patents?',
      '/patents/download/',
      '/patents/pdf/',
      '/patents/related/',
      '/scholar',
      '/citations?',
      '/citations?*cstart=',
      '/s?',
      '/maps?',
      '/mapstt?',
      '/mapslt?',
      '/maps/stk/',
      '/maps/br?',
      '/mapabcpoi?',
      '/maphp?',
      '/mapprint?',
      '/maps/api/js/',
      '/maps/api/place/js/',
      '/maps/api/staticmap',
      '/maps/api/streetview',
      '/maps/_/sw/manifest.json',
      '/mld?',
      '/staticmap?',
      '/maps/preview',
      '/maps/place',
      '/maps/timeline/',
      '/help/maps/streetview/partners/welcome/',
      '/help/maps/indoormaps/partners/',
      '/lochp?',
      '/center',
      '/ie?',
      '/blogsearch/',
      '/blogsearch_feeds',
      '/advanced_blog_search',
      '/uds/',
      '/chart?',
      '/transit?',
      '/calendar/',
      '/cl2/feeds/',
      ... 116 more items
    ],
    host: '/hosted/images/'
  },
  'AdsBot-Google': {
    allow: [ '/maps/api/js' ],
    disallow: [
      '/maps/api/js/',
      '/maps/api/place/js/',
      '/maps/api/staticmap',
      '/maps/api/streetview'
    ]
  },
  Twitterbot: { allow: [ '/imgres' ], disallow: [] },
  facebookexternalhit: { allow: [ '/imgres' ], disallow: [] }
}

whatToFollow()

Determines which URLs (links) to follow. The special character * from the robots.txt is replaced with the regular expression .*.
The relative URLs found in robots.txt are converted to absolute.
const follow_urls = robotsTxt.whatToFollow();
https://www.google.com/robots.txt
--------------------------------------

follow_urls::  [ 'https://www.google.com/maps/api/js' ]

whatToUnfollow()

Determines which URLs (links) NOT to follow. The special character * from the robots.txt is replaced with the regular expression .*.
The relative URLs found in robots.txt are converted to absolute.
// Absolute URLs that must NOT be followed.
const unfollow_urls = robotsTxt.whatToUnfollow();
https://www.google.com/robots.txt
--------------------------------------

unfollow_urls::  [
  'https://www.google.com/maps/api/js/',
  'https://www.google.com/maps/api/place/js/',
  'https://www.google.com/maps/api/staticmap',
  'https://www.google.com/maps/api/streetview'
]