# usage:
# fpgrabber.py -u [url to crawl] -d [output directory]
# recursively crawl the url and download fp files into the output directory.
# code borrowed from http://theanti9.wordpress.com/2009/02/14/python-web-crawler-in-less-than-50-lines/
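# Example invocation (the URL and directory below are placeholders for illustration,
# not values from the original post -- substitute the real footprint site and a local path):
#   python fpgrabber.py -u http://www.example.com/footprints/ -d ./footprints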
import getopt, sys, os, os.path, re, urllib2, urlparse

def usage():
    print """
    usage:
    fpgrabber.py -u [url to crawl] -d [output directory]
    recursively crawl the url and download fp files into the output directory.
    """

# matches the href value of anchor tags, e.g. <a href="page.html">
linkregex = re.compile('<a\s*href=[\"\'](.[^\"\']+)[\"\']')
# matches the target of a <meta> refresh redirect, e.g. content="0; URL=next.html"
redirectex = re.compile('content=[\"\']\d+;\s*URL=(.[^\"\']+)[\"\']')
tocrawl = []
crawled = set([])
outputdir = None
inputurl = None
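# Note: crawl() pops URLs from the end of tocrawl, so the site is walked roughly
# depth-first; crawled records every page already fetched so it is not queued again.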

def main():
    global inputurl, outputdir

    try:
        # long options that take a value need a trailing "=" so getopt accepts an argument
        opts, args = getopt.getopt(sys.argv[1:], "hu:d:", ["help", "url=", "directory="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print str(err)  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ("-u", "--url"):
            inputurl = a
        elif o in ("-d", "--directory"):
            outputdir = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit(2)
        else:
            assert False, "unhandled option"

    # both the url and the output directory are required
    if not inputurl or not outputdir:
        usage()
        sys.exit(2)
    # create the output directory unless it is a bare drive letter (e.g. "c:") or already exists
    if not outputdir.endswith(':') and not os.path.exists(outputdir):
        os.mkdir(outputdir)

    if not inputurl.endswith("/"):
        # footprint site seems to require a trailing slash
        inputurl = inputurl + "/"

    tocrawl.append(inputurl)
    crawl()

def savefp(url):
    """Download one footprint file and save it under outputdir."""
    print " saving " + url
    try:
        filehandle = urllib2.urlopen(url)
    except Exception as inst:
        print inst
        print "unable to open " + url
        return

    # the download links end in "fp?dl"; drop the "?dl" suffix to get the file name
    fn = url.split('/')[-1]
    fn = urllib2.unquote(fn[:-3])
    try:
        # 'wb' so the downloaded bytes are written out unchanged
        f = open(os.path.join(outputdir, fn), 'wb')
    except IOError:
        print "unable to save " + fn
        filehandle.close()
        return

    f.write(filehandle.read())
    f.close()
    filehandle.close()
def crawl():
    """Pop URLs off tocrawl, harvest links, and hand footprint downloads to savefp()."""
    while len(tocrawl) > 0:
        crawling = tocrawl.pop()
        print "crawling " + crawling

        try:
            response = urllib2.urlopen(crawling)
        except Exception:
            # skip pages that fail to open instead of aborting the whole crawl
            print "failed to open " + crawling
            crawled.add(crawling)
            continue

        if crawling != response.geturl():
            # this never gets triggered by the footprint site, geturl doesn't seem to return redirects
            print "changed " + crawling + " " + response.geturl()

        msg = response.read()
        response.close()
        links = linkregex.findall(msg)
        if len(links) == 0 and msg.find("<meta") >= 0:
            # no anchors found; fall back to a <meta> refresh redirect target
            links = redirectex.findall(msg)

        crawled.add(crawling)
        for link in links:
            # quote the link but leave URL structure characters alone, then resolve it against the current page
            newlink = urlparse.urljoin(crawling, urllib2.quote(link, ":/?%+"))
            if newlink.endswith("fp?dl"):
                # footprint download links end in "fp?dl"
                savefp(newlink)
                continue

            # only follow links that stay under the starting url and have not been seen or queued yet
            if inputurl in newlink and newlink not in crawled and newlink not in tocrawl:
                tocrawl.append(newlink)

if __name__ == "__main__":
    main()