Home
       eomyidae - eomyidae - a gopher crawler software
  HTML git clone git://bitreich.org/eomyidae
   DIR Log
   DIR Files
   DIR Refs
   DIR Tags
   DIR README
   DIR LICENSE
       ---
       eomyidae (15243B)
       ---
            1 #!/usr/bin/env python
            2 # coding=utf-8
            3 #
            4 # See the LICENSE file for details.
            5 #
            6 
            7 import os
            8 import sys
            9 import getopt
           10 import urllib.parse
           11 import socket
           12 import io
           13 import pickle
           14 import time
           15 import hashlib
           16 import errno
           17 import random
           18 import operator
           19 import math
           20 from multiprocessing import Pool
           21 from datetime import datetime
           22 from datetime import timedelta
           23 
           24 def parseuri(uri):
           25         urls = urllib.parse.urlparse(uri, allow_fragments=False)
           26         if ":" in urls.netloc:
           27                 (host, port) = urls.netloc.split(":")[:2]
           28         else:
           29                 host = urls.netloc
           30                 port = 70
           31 
           32         mtype = "1"
           33         if len(urls.path) > 1:
           34                 mtype = urls.path[1]
           35 
           36         if len(urls.path) > 2:
           37                 if len(urls.query) > 0:
           38                         selector = "%s?%s" % (urls.path[2:], urls.query)
           39                 else:
           40                         selector = urls.path[2:]
           41         else:
           42                 selector = ""
           43 
           44         return (host, port, mtype, selector) 
           45 
           46 def poolgopher(req):
           47         data = gopher(req[0], req[1], req[2], req[3])
           48         req.append(data)
           49         return req 
           50 
           51 def gopher(uri=None, host=None, port=70, selector=""):
           52         #print("gopher(uri = %s, host = %s, port = %d, selector = %s)" % \
           53         #                (uri, host, port, selector))
           54         if uri != None:
           55                 (host, port, mtype, selector) = parseuri(uri)
           56                 port = int(port)
           57 
           58         s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
           59         s.settimeout(20)
           60         try:
           61                 s.connect((host, port))
           62         except socket.gaierror:
           63                 return ""
           64         except socket.timeout:
           65                 return ""
           66         except TimeoutError:
           67                 return ""
           68         except ConnectionResetError:
           69                 return ""
           70         except OverflowError:
           71                 return ""
           72         except OSError as e:
           73                 # No route to host.
           74                 if e.errno == 113:
           75                         return ""
           76 
           77         try:
           78                 s.send(("%s\r\n" % (selector)).encode("utf-8"))
           79         except BrokenPipeError:
           80                 return ""
           81 
           82         fd = s.makefile("b")
           83         try:
           84                 data = fd.read()
           85         except socket.timeout:
           86                 fd.close()
           87                 return ""
           88         except ConnectionResetError:
           89                 fd.close()
           90                 return ""
           91         fd.close()
           92 
           93         try:
           94                 content = data.decode(errors='replace')
           95         except UnicodeDecodeError:
           96                 content = data.decode("iso-8859-1")
           97 
           98         return content
           99 
          100 def parsemenu(data):
          101         menu = []
          102         lines = data.split("\n")
          103         for line in lines:
          104                 line = line.strip()
          105                 if len(line) < 1:
          106                         continue
          107 
          108                 mtype = line[0]
          109 
          110                 # Last entry
          111                 if mtype == ".":
          112                         break
          113 
          114                 elements = line[1:].split("\t")
          115                 if len(elements) < 4:
          116                         continue
          117                 (description, selector, host, port) = elements[:4]
          118                 menu.append([mtype, description, selector, host, port])
          119 
          120         return menu
          121 
          122 def menu2text(menu):
          123         text = ""
          124         for entry in menu:
          125                 if type(entry[1]) != str:
          126                                 continue
          127 
          128                 text += "%s\n" % (entry[1])
          129         
          130         return text
          131 
          132 ## Robots.txt
          133 # https://en.wikipedia.org/wiki/Robots.txt
          134 # # Comment
          135 # User-agent: somebot
          136 # Disallow: /path
          137 # Allow: /path
          138 # Crawl-delay: seconds
          139 def parserobots(data):
          140         robots = []
          141         lines = data.split("\n")
          142         for line in lines:
          143                 line = line.strip()
          144                 if "#" in line:
          145                         (line, comment) = line.split("#", 1)
          146                 if len(line) < 0:
          147                         # Empty line, needed for bot-specific rules.
          148                         robots.append(["",""])
          149                         continue
          150                 if not ":" in line:
          151                         continue
          152 
          153                 (header, value) = line.strip().split(":", 1)
          154                 value = value.strip().lower()
          155                 header = header.strip().lower()
          156                 robots.append([header, value])
          157         return robots
          158 
          159 def adaptrobots(robotsdata):
          160         filterlines = {}
          161         robotslines = parserobots(robotsdata)
          162         i = 0
          163 
          164         allowlines = []
          165         disallowlines = []
          166         otherlines = []
          167         iseomyidae = False
          168         while i < len(robotslines):
          169                 header = robotslines[i][0].lower()
          170                 value = robotslines[i][1]
          171                 if header == "user-agent":
          172                         ua = value.split("/")
          173                         if ua[0] == "eomyidae" or ua[0] == "*":
          174                                 iseomyidae = 1
          175                         else:
          176                                 iseomyidae = 0
          177                 elif header == "allow" and iseomyidae == True:
          178                         allowlines.append(value)
          179                 elif header == "disallow" and iseomyidae == True:
          180                         disallowlines.append(value)
          181                 elif header == "":
          182                         iseomyidae = False
          183                 else:
          184                         if iseomyidae == True:
          185                                 otherlines.append([header, value])
          186                 i += 1
          187 
          188         filterlines["allow"] = allowlines
          189         filterlines["disallow"] = disallowlines
          190         filterlines["other"] = otherlines
          191         if len(allowlines) > 0 or len(disallowlines) > 0 \
          192                         or len(otherlines) > 0:
          193                 filterlines["empty"] = False
          194         else:
          195                 filterlines["empty"] = True
          196         
          197         return filterlines
          198 
          199 def mkpath(cachepath):
          200         try:
          201                 os.makedirs(cachepath)
          202         except OSError as e:
          203                 if e.errno != errno.EEXIST:
          204                         raise
          205 
          206 def mkopen(cachefile):
          207         if not os.path.exists(cachefile):
          208                 fd = open(cachefile, "xb")
          209         else:
          210                 fd = open(cachefile, "wb")
          211         return fd
          212 
          213 def informserveradmin(uri, host=None, port=70):
          214         if host == None:
          215                 (host, port, mtype, selector) = parseuri(uri)
          216                 port = int(port)
          217 
          218         # We are nice and inform before every robots.txt, how to contact us.
          219         gopher(host=host, port=port, selector="This is eomyidae, your "
          220                         "friendly crawler. See "
          221                         "gopher://gopherproject.org/1/eomyidae for "
          222                         "more info. Have a nice day!")
          223 
          224 def cacherobots(cachedir, uri, host=None, port=70, force=False, \
          225                 filtercache=None):
          226         if host == None:
          227                 (host, port, mtype, selector) = parseuri(uri)
          228                 port = int(port)
          229 
          230         if filtercache != None and host in filtercache:
          231                 #print("Got filterlines from memory filtercache.")
          232                 return filtercache[host]
          233 
          234         print("Getting robots for %s:%d" % (host, port))
          235 
          236         cachepath = "%s/%s:%d" % (cachedir, host, port)
          237         mkpath(cachepath)
          238 
          239         cacherobotstxt = "%s/robots.txt" % (cachepath)
          240         cacherobotspickle = "%s/robots.pickle" % (cachepath)
          241         filterlines = {}
          242         if not os.path.exists(cacherobotstxt) or force == True:
          243                 # Be nice.
          244                 informserveradmin(uri=uri, host=host, port=port)
          245 
          246                 robotsdata = gopher(host=host, port=port, selector="/robots.txt")
          247                 print("Got new robots.txt.")
          248                 print(robotsdata)
          249                 robotstxtfd = mkopen(cacherobotstxt)
          250                 robotstxtfd.write(robotsdata.encode())
          251                 robotstxtfd.close()
          252 
          253                 filterlines = adaptrobots(robotsdata)
          254                 # Do not store if there is nothing, so we save I/O later.
          255                 if filterlines["empty"] == False:
          256                         print("Storing filterlines.")
          257                         storelistdb(cacherobotspickle, filterlines)
          258 
          259         else:
          260                 if os.path.exists(cacherobotspickle):
          261                         #print("Loading filterlines from cache.")
          262                         filterlines = loadlistdb(cacherobotspickle)
          263                 else:
          264                         #print("No filterlines available in cache.")
          265                         filterlines["empty"] = True
          266 
          267         #print(filterlines)
          268         if filtercache != None:
          269                 filtercache[host] = filterlines
          270 
          271         return filterlines
          272 
          273 def selectorisallowed(filterlines, selector):
          274         if filterlines["empty"] == True:
          275                 return True
          276 
          277         def robotsmatch(pattern, selector):
          278                 #print("pattern = %s, selector = %s" % (pattern, selector))
          279                 if pattern == '*':
          280                         #print("Just start match.")
          281                         return True
          282                 elif pattern[0] == '*':
          283                         #print("Begins with star.")
          284                         if pattern[-1] == '*':
          285                                 #print("Begins and ends with star.")
          286                                 if pattern[1:-1] in selector:
          287                                         #print("Matches.")
          288                                         return True
          289                                 else:
          290                                         return False
          291                         else:
          292                                 return selector.endswith(pattern[1:])
          293                 elif pattern[-1] == '*':
          294                         #print("Ends with star.")
          295                         return selector.startswith(pattern[:-1])
          296                 else:
          297                         return selector.startswith(pattern)
          298 
          299         isallowed = True
          300         for line in filterlines["disallow"]:
          301                 # TODO: Should this be match everything?
          302                 if len(line) == 0:
          303                         continue
          304                 if robotsmatch(line, selector) == True:
          305                         #print("isallowed = False")
          306                         isallowed = False
          307         for line in filterlines["allow"]:
          308                 # TODO: Should this be match everything?
          309                 if len(line) == 0:
          310                         continue
          311                 if robotsmatch(line, selector) == True:
          312                         #print("isallowed = True")
          313                         isallowed = True
          314 
          315         #print("isallowed = %d" % (isallowed))
          316         return isallowed
          317 
          318 def loadselectorstxt(filename):
          319         selectors = []
          320 
          321         if os.path.exists(filename):
          322                 fd = open(filename, "r")
          323                 for line in fd:
          324                         fields = line.split("|")
          325                         selectors.append(fields)
          326                 fd.close()
          327         
          328         return selectors
          329 
          330 def loadlist(filename):
          331         listelems = []
          332 
          333         if os.path.exists(filename):
          334                 fd = open(filename, "r")
          335                 for line in fd:
          336                         line = line.strip()
          337                         if len(line) == 0:
          338                                 continue
          339                         if line[0] == "#":
          340                                 continue
          341                         listelems.append(line)
          342                 fd.close()
          343         
          344         return listelems
          345 
          346 def loadlistdb(filename):
          347         listelems = []
          348 
          349         if os.path.exists(filename):
          350                 fd = open(filename, "rb")
          351                 try:
          352                         listelems = pickle.load(fd)
          353                 except EOFError:
          354                         return []
          355                 fd.close()
          356         
          357         return listelems
          358 
          359 def storelistdb(filename, listelems):
          360         fd = mkopen(filename)
          361         pickle.dump(listelems, fd)
          362         fd.close()
          363 
          364 def storerawdata(cachedir, uri, data, host=None, port=70):
          365         if host == None:
          366                 (host, port, mtype, selector) = parseuri(uri)
          367                 port = int(port)
          368 
          369         cachepath = "%s/%s:%s" % (cachedir, host, port)
          370         mkpath(cachepath)
          371 
          372         m = hashlib.sha256()
          373         m.update(uri.encode())
          374         urihash = m.hexdigest()
          375 
          376         cachepath = "%s/%s.menu" % (cachepath, urihash)
          377         fd = mkopen(cachepath)
          378         #print("Storing %s at %s" % (uri, cachepath))
          379         fd.write(("%s\n" % (uri)).encode())
          380         fd.write(data.encode())
          381         fd.close()
          382 
          383 def usage(app):
          384         app = os.path.basename(app)
          385         print("usage: %s [-hor] [-b base] [-f blocklist] [-w n] [starturl]" % (app), file=sys.stderr)
          386         sys.exit(1)
          387 
          388 def main(args):
          389         try:
          390                 opts, largs = getopt.getopt(args[1:], "hb:f:ow:r")
          391         except getopt.GetoptError as err:
          392                 print(str(err))
          393                 usage(args[0])
          394 
          395         blocklistfile = None
          396         blocklist = []
          397 
          398         base = "."
          399         starturi = None
          400         workernum = 1
          401         robotscache = {}
          402         forcehostscount = False
          403         for o, a in opts:
          404                 if o == "-h":
          405                         usage(args[0])
          406                 elif o == "-b":
          407                         base = a
          408                 elif o == "-f":
          409                         blocklistfile = a
          410                         blocklist = loadlist(blocklistfile)
          411                         print("blocklist: %s" % (blocklist))
          412                 elif o == "-o":
          413                         forcehostscount = True
          414                 elif o == "-r":
          415                         # Do not cache robots.txt in memory.
          416                         robotscache = None
          417                 elif o == "-w":
          418                         try:
          419                                 workernum = int(a)
          420                         except ValueError:
          421                                 workernum = 1
          422                 else:
          423                         assert False, "unhandled option"
          424 
          425         os.chdir(base)
          426         cachedir = "%s/cache" % (base)
          427 
          428         if len(largs) > 0:
          429                 starturi = largs[0]
          430 
          431         knownuris = loadlistdb("knownuris.pickle")
          432         if knownuris == []:
          433                 knownuris = {}
          434         lastlenknownuris = len(knownuris)
          435 
          436         def isblocked(uri):
          437                 for rule in blocklist:
          438                         if uri.startswith(rule):
          439                                 return True
          440                 return False
          441 
          442         def addhostscount(host):
          443                 if host in hostscount:
          444                         hostscount[host] += 1
          445                 else:
          446                         hostscount[host] = 1
          447 
          448         def subhostscount(host):
          449                 if host in hostscount:
          450                         hostscount[host] -= 1
          451                         if hostscount[host] <= 0:
          452                                 del hostscount[host]
          453 
          454         def addhostscache(uri, host=None, port=70, selector="/"):
          455                 if uri != None and host == None:
          456                         (host, port, mtype, selector) = parseuri(uri)
          457                         port = int(port)
          458                 else:
          459                         try:
          460                                 port = int(port)
          461                         except ValueError:
          462                                 return
          463 
          464                 if uri in knownuris:
          465                         print("ignored for queue: %s" % (uri))
          466                         return
          467                 if host == "":
          468                         print("ignored for queue: %s" % (uri))
          469                         return
          470                 if isblocked(uri):
          471                         print("blocked by filters: %s" % (uri))
          472                         return
          473 
          474                 addhostscount(host)
          475 
          476                 if not host in hostscache:
          477                         hostscache[host] = {}
          478                 if not "queue" in hostscache[host]:
          479                         hostscache[host]["queue"] = {}
          480 
          481                 filterrules = cacherobots(cachedir, uri, \
          482                                 host=host, \
          483                                 port=port, \
          484                                 filtercache=robotscache)
          485                 if selectorisallowed(filterrules, selector) == True:
          486                         hostscache[host]["queue"][uri] = None
          487                         print("pushed to queue: %s" % (uri))
          488                 else:
          489                         pass
          490                         print("blocked by robots: %s" % (uri))
          491 
          492         def getqueuelen():
          493                 queuelen = 0
          494                 for host in hostscache:
          495                         queuelen += len(hostscache[host]["queue"])
          496                 return queuelen
          497 
          498         hostscache = loadlistdb("hostscache.pickle")
          499         if hostscache == []:
          500                 hostscache = {}
          501         hostscount = loadlistdb("hostscount.pickle")
          502         if hostscount == [] or forcehostscount == True:
          503                 hostscount = {}
          504                 for host in list(hostscache.keys()):
          505                         print("host = %s, queuelen = %d" \
          506                                         % (host, \
          507                                            len(hostscache[host]["queue"])))
          508                         if len(hostscache[host]["queue"]) == 0:
          509                                 del hostscache[host]
          510                                 continue
          511                         for uri in hostscache[host]["queue"]:
          512                                 (host, port, mtype, selector) = parseuri(uri)
          513                                 addhostscount(host)
          514 
          515         def storestate():
          516                 if blocklistfile != None:
          517                         blocklist = loadlist(blocklistfile)
          518                         if len(blocklist) > 0:
          519                                 print("blocklist: %s" % (blocklist))
          520                 print("################## Storing state to disc.")
          521                 storelistdb("knownuris.pickle", knownuris)
          522                 storelistdb("hostscache.pickle", hostscache)
          523                 storelistdb("hostscount.pickle", hostscount)
          524                 print("################## Storing state to disc done.")
          525 
          526         jobs = []
          527         if starturi != None:
          528                 #print("starturi = %s" % (starturi))
          529                 if not isblocked(starturi):
          530                         (starthost, startport, startmtype, startselector) = parseuri(starturi)
          531                         addhostscache(starturi, \
          532                                         selector=startselector, \
          533                                         host=starthost, \
          534                                         port=startport)
          535                         try:
          536                                 jobs.append([starturi, starthost, int(startport), startselector])
          537                         except ValueError:
          538                                 # Please fix your URI.
          539                                 pass
          540 
          541         # Store state keeper.
          542         startnow = datetime.now()
          543         storedelta = timedelta(seconds=10) # 30 seconds
          544 
          545         lastlenknownhosts = len(hostscache)
          546         lastlenuriqueue = getqueuelen()
          547         while lastlenuriqueue > 0:
          548                 if len(jobs) < workernum:
          549                         for host in list(hostscache.keys()):
          550                                 if len(hostscache[host]["queue"]) == 0:
          551                                         del hostscache[host]
          552                                         if host in hostscount:
          553                                                 del hostscount[host]
          554 
          555                         selhosts = sorted(hostscount.items(), \
          556                                         key=operator.itemgetter(1))[:workernum*2]
          557 
          558                         # Give hosts with many selectors more jobs.
          559                         hostjobs = {}
          560                         for selhost in selhosts:
          561                                 # 10 ** x
          562                                 hostjobs[selhost[0]] = \
          563                                         math.floor(math.log10(selhost[1]))
          564                                 if hostjobs[selhost[0]] == 0:
          565                                         hostjobs[selhost[0]] = 1
          566                         print("Queue Status: %s" % (hostjobs))
          567 
          568                         for selhost in selhosts:
          569                                 selhost = selhost[0]
          570                                 seluris = hostscache[selhost]["queue"]
          571                                 while hostjobs[selhost] > 0:
          572                                         if len(seluris) == 0:
          573                                                 break
          574                                         jobitem = seluris.popitem()
          575                                         if isblocked(jobitem[0]):
          576                                                 continue
          577                                         (host, port, mtype, selector) = parseuri(jobitem[0])
          578                                         job = [jobitem[0], host, port, selector]
          579                                         if job not in jobs:
          580                                                 jobs.append([jobitem[0], host, port, selector])
          581                                         hostjobs[selhost] -= 1
          582 
          583                 print("Getting %d jobs." % (len(jobs)))
          584 
          585                 dataresults = []
          586                 with Pool(processes=workernum) as pool:
          587                         dataresults = pool.map(poolgopher, jobs)
          588                         #data = gopher(host=host, port=port, selector=selector)
          589                 jobs = []
          590 
          591                 for dataresult in dataresults:
          592                         (cururi, host, port, selector, data) = dataresult
          593                         subhostscount(host)
          594                         storerawdata(cachedir, cururi, data, host=host, port=port)
          595                         menudata = parsemenu(data)
          596                         #print(menudata)
          597                         for mi in menudata:
          598                                 # Only menus so far.
          599                                 if mi[0] == "1":
          600                                         # Fix menu items with ports in hosts. 
          601                                         if ":" in mi[3]:
          602                                                 mi[3] = mi[3].split(":")[0]
          603 
          604                                         guri =  "gopher://%s:%s/%s%s" % \
          605                                                         (mi[3], mi[4], mi[0], mi[2])
          606 
          607                                         addhostscache(guri, host=mi[3], \
          608                                                         port=mi[4], \
          609                                                         selector=mi[2])
          610 
          611                         print("Uri %s done." % (cururi))
          612                         knownuris[cururi] = None
          613 
          614                 lenuriqueue = getqueuelen()
          615                 lenknownuris = len(knownuris)
          616                 lenknownhosts = len(hostscache)
          617                 print("> queue hosts = %d (%d) %s" % \
          618                                 (lenknownhosts, lenknownhosts -
          619                                         lastlenknownhosts, hostscache.keys()))
          620                 print("> uri queue len = %d (%d)" % \
          621                                 (lenuriqueue, lenuriqueue - lastlenuriqueue))
          622                 print("> visited uris = %d (%d)" % \
          623                                 (lenknownuris, lenknownuris - lastlenknownuris))
          624                 lastlenknownuris = lenknownuris
          625                 lastlenuriqueue = lenuriqueue
          626                 lastlenknownhosts = lenknownhosts
          627 
          628                 # TODO: Remove after debugging
          629                 nowdelta = datetime.now() - startnow
          630                 if nowdelta >= storedelta:
          631                         storestate()
          632                         startnow = datetime.now()
          633 
          634                 time.sleep(0.2) # don't be too harsh on servers
          635 
          636                 #break #oneshot
          637 
          638         # Save at end of even single shot.
          639         storestate()
          640 
          641         return 0
          642 
          643 if __name__ == "__main__":
          644         sys.exit(main(sys.argv))
          645