Home
       Optimize savehostscache. - eomyidae - a gopher crawler software
  HTML git clone git://bitreich.org/eomyidae
   DIR Log
   DIR Files
   DIR Refs
   DIR Tags
   DIR README
   DIR LICENSE
       ---
   DIR commit 29cd7839e600acdd21378256d73b4703f799f04a
   DIR parent 0dac4a637d7e25983b563286bb0539d53ddf8d3e
  HTML Author: Christoph Lohmann <20h@r-36.net>
       Date:   Mon, 12 Aug 2019 11:48:12 +0200
       
       Optimize savehostscache.
       
       Diffstat:
         M eomyidae                            |      49 ++++++++++++++++++++-----------
       
       1 file changed, 32 insertions(+), 17 deletions(-)
       ---
   DIR diff --git a/eomyidae b/eomyidae
       @@ -429,6 +429,8 @@ def main(args):
                        starturi = largs[0]
        
                knownuris = loadlistdb("knownuris.pickle")
       +        if knownuris == []:
       +                knownuris = {}
                lastlenknownuris = len(knownuris)
        
                def isblocked(uri):
       @@ -449,38 +451,43 @@ def main(args):
                                if hostscount[host] <= 0:
                                        del hostscount[host]
        
       -        def addhostscache(host, uri, port=70):
       +        def addhostscache(uri, host=None, port=70, selector="/"):
       +                if uri != None and host == None:
       +                        (host, port, mtype, selector) = parseuri(uri)
       +                        port = int(port)
       +                else:
       +                        try:
       +                                port = int(port)
       +                        except ValueError:
       +                                return
       +
                        if uri in knownuris:
       -                        #print("ignored for queue: %s" % (uri))
       +                        print("ignored for queue: %s" % (uri))
                                return
                        if host == "":
       -                        #print("ignored for queue: %s" % (uri))
       +                        print("ignored for queue: %s" % (uri))
                                return
                        if isblocked(uri):
                                print("blocked by filters: %s" % (uri))
                                return
        
       -                try:
       -                        port = int(port)
       -                except ValueError:
       -                        return
       -
                        addhostscount(host)
        
       +                if not host in hostscache:
       +                        hostscache[host] = {}
       +                if not "queue" in hostscache[host]:
       +                        hostscache[host]["queue"] = {}
       +
                        filterrules = cacherobots(cachedir, uri, \
                                        host=host, \
                                        port=port, \
                                        filtercache=robotscache)
                        if selectorisallowed(filterrules, selector) == True:
       -                        if not host in hostscache:
       -                                hostscache[host] = {}
       -                        if not "queue" in hostscache[host]:
       -                                hostscache[host]["queue"] = {}
                                hostscache[host]["queue"][uri] = None
       -                        #print("pushed to queue: %s" % (uri))
       +                        print("pushed to queue: %s" % (uri))
                        else:
                                pass
       -                        #print("blocked by robots: %s" % (uri))
       +                        print("blocked by robots: %s" % (uri))
        
                def getqueuelen():
                        queuelen = 0
       @@ -518,9 +525,13 @@ def main(args):
        
                jobs = []
                if starturi != None:
       +                #print("starturi = %s" % (starturi))
                        if not isblocked(starturi):
                                (starthost, startport, startmtype, startselector) = parseuri(starturi)
       -                        addhostscache(hostscache, starthost, starturi)
       +                        addhostscache(starturi, \
       +                                        selector=startselector, \
       +                                        host=starthost, \
       +                                        port=startport)
                                try:
                                        jobs.append([starturi, starthost, int(startport), startselector])
                                except ValueError:
       @@ -564,7 +575,9 @@ def main(args):
                                                if isblocked(jobitem[0]):
                                                        continue
                                                (host, port, mtype, selector) = parseuri(jobitem[0])
       -                                        jobs.append([jobitem[0], host, port, selector])
       +                                        job = [jobitem[0], host, port, selector]
       +                                        if job not in jobs:
       +                                                jobs.append([jobitem[0], host, port, selector])
                                                hostjobs[selhost] -= 1
        
                        print("Getting %d jobs." % (len(jobs)))
       @@ -591,7 +604,9 @@ def main(args):
                                                guri =  "gopher://%s:%s/%s%s" % \
                                                                (mi[3], mi[4], mi[0], mi[2])
        
       -                                        addhostscache(mi[3], guri, port=mi[4])
       +                                        addhostscache(guri, host=mi[3], \
       +                                                        port=mi[4], \
       +                                                        selector=mi[2])
        
                                print("Uri %s done." % (cururi))
                                knownuris[cururi] = None