curl-and-python

Function to retrieve multiple URLs asynchronously

From: gf gf <unknownsoldier93_at_yahoo.com>
Date: Mon, 7 Mar 2005 21:14:50 -0800 (PST)

Has anyone written a function to retrieve multiple
URLs asynchronously? I tried to hack the
retriever-multi.py to do so (see below), but am having
some trouble with it.

Anyway, here's my hack. Comments, feedback, and fixes
appreciated. Please cc me on all responses.

import sys
import pycurl
import cStringIO

def harvestURLs(urls, num_conn=10):

    # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
    # the libcurl tutorial for more info.
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass

    # Make a queue with (url, filename) tuples
    queue = []
    for url in urls:
        url = url.strip()
        if not url or url[0] == "#":
            continue
        filename = "doc_%03d.dat" % (len(queue) + 1)
        queue.append((url, filename))

    # Check args
    assert queue, "no URLs given"
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
    print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
        
        
    # Pre-allocate a list of curl objects
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        #c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.CONNECTTIMEOUT, 15)
        #c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.TIMEOUT, 25)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)
        
        
    # Main loop
    freelist = m.handles[:]
    num_processed = 0
    r = []
    while num_processed < num_urls:
        # If there is a url to process and a free curl object, add to multi stack
        while queue and freelist:
            url, filename = queue.pop(0)
            c = freelist.pop()
            #c.fp = open(filename, "wb")
            c.setopt(pycurl.URL, url)
            #c.setopt(pycurl.WRITEDATA, c.fp)
            m.add_handle(c)
            # store some info
            #c.filename = filename
            c.url = url
            c.res = cStringIO.StringIO()
            c.setopt(pycurl.WRITEFUNCTION, c.res.write)

        # Run the internal curl state machine for the multi stack
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        # Check for curl objects which have terminated, and add them to the freelist
        while 1:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                #c.fp.close()
                #c.fp = None
                m.remove_handle(c)
                #print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
                print "Success:", c.url, c.getinfo(pycurl.EFFECTIVE_URL)
                # Collect the response body now, before this handle is reused
                # for another URL (otherwise only the last batch survives).
                r.append((c.url, c.res.getvalue()))
                freelist.append(c)
            for c, errno, errmsg in err_list:
                #c.fp.close()
                #c.fp = None
                m.remove_handle(c)
                #print "Failed: ", c.filename, c.url, errno, errmsg
                print "Failed: ", c.url, errno, errmsg
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break

        # Currently no more I/O is pending, could do something in the meantime
        # (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        #m.select()
        # I was getting stuckage, so let's try this
        print "%s URLs total. %s completed." % (num_urls, num_processed)
        m.select(25)
        
    # Cleanup
    for c in m.handles:
        #if c.fp is not None:
        #    c.fp.close()
        #    c.fp = None
        c.close()
    m.close()

    print "Returning %s URLs" % len(r)
    return r
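
In case it helps, here is roughly how I call it. The URLs and the way the
results get used below are just an illustration, not part of the function:

if __name__ == "__main__":
    # Example only - any list of URL strings will do here.
    urls = [
        "http://curl.haxx.se/",
        "http://www.python.org/",
        "http://pycurl.sourceforge.net/",
    ]
    results = harvestURLs(urls, num_conn=3)
    # harvestURLs returns a list of (url, body) tuples,
    # one per successful transfer.
    for url, body in results:
        print url, "->", len(body), "bytes"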

        
                
_______________________________________________
http://cool.haxx.se/mailman/listinfo/curl-and-python
Received on 2005-03-08