fpgrabber.py

# usage:
# fpgrabber.py -u [url to crawl] -d [output directory]
# recursively crawl the url and download fp files into the output directory.
# code borrowed from http://theanti9.wordpress.com/2009/02/14/python-web-crawler-in-less-than-50-lines/
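#
# example invocation (the URL and directory below are placeholders, not a real site):
#   python fpgrabber.py -u http://example.com/footprints/ -d footprints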
import getopt, sys, os, os.path, re, urllib2, urlparse

def usage():
    print """
usage:
fpgrabber.py -u [url to crawl] -d [output directory]
recursively crawl the url and download fp files into the output directory.
"""
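
# linkregex captures the href target of each <a> tag; redirectex captures the
# destination URL of a meta refresh tag (used below when a page yields no links).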
linkregex = re.compile('<a\s*href=[\"\'](.[^\"\']+)[\"\']')
redirectex = re.compile('content=[\"\']\d+;\s*URL=(.[^\"\']+)[\"\']')
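
# crawl state: tocrawl is the work list, crawled the set of visited URLs;
# outputdir and inputurl are filled in from the command line.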
tocrawl = []
crawled = set([])
outputdir = None
inputurl = None

def main():
    global inputurl, outputdir
    try:
        # long options that take a value need a trailing '=' for getopt to accept their arguments
        opts, args = getopt.getopt(sys.argv[1:], "hu:d:", ["help", "url=", "directory="])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err)  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        #print o
        #print a
        if o in ("-u", "--url"):
            inputurl = a
        elif o in ("-d", "--directory"):
            outputdir = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit(2)
        else:
            assert False, "unhandled option"
    if not inputurl:
        usage()
        sys.exit(2)
    if not outputdir:
        usage()
        sys.exit(2)
    if not outputdir.endswith(':') and not os.path.exists(outputdir):
        os.mkdir(outputdir)
    if not inputurl.endswith("/"):
        #footprint site seems to require this
        inputurl = inputurl + "/"
    tocrawl.append(inputurl)
    crawl()
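
# savefp: fetch a footprint URL and write it into the output directory, naming
# the file after the last path segment with the trailing "?dl" removed.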
def savefp(url):
    print " saving " + url
    filehandle = None
    try:
        filehandle = urllib2.urlopen(url)
    except Exception as inst:
        print inst
    if not filehandle:
        print "unable to open 1 " + url
        return
    p = url.split('/')
    fn = p[len(p) - 1]
    fn = fn[:-3]  # drop the trailing "?dl" from the file name
    fn = urllib2.unquote(fn)
    f = open(os.path.join(outputdir, fn), 'w')
    if not f:
        print "unable to save " + fn
        filehandle.close()
        return
    for lines in filehandle.readlines():
        f.write(lines)
    f.close()
    filehandle.close()
    return
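
# crawl: repeatedly pop a URL off the work list, download any link ending in
# "fp?dl" via savefp, and queue links that stay under the starting URL.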
def crawl():
    while len(tocrawl) > 0:
        crawling = tocrawl.pop()
        print "crawling " + crawling
        try:
            response = urllib2.urlopen(crawling)
        except:
            print "failed to open " + crawling
            return
        #print "code " + str(response.getcode())
        #url = urlparse.urlparse(response.geturl())
        if crawling != response.geturl():
            #this never gets triggered by the footprint site, geturl doesn't seem to return redirects
            print "changed " + crawling + " " + response.geturl()
        #info = response.info()
        #print info
        msg = response.read()
        response.close()
        links = linkregex.findall(msg)
        if len(links) == 0:
            if msg.find("<meta") >= 0:
                links = redirectex.findall(msg)
                #print " redirect " + msg + " " + str(len(links))
        crawled.add(crawling)
        for link in links:
            newlink = urlparse.urljoin(crawling, urllib2.quote(link, ":/?%+"))
            #print " link " + newlink + " " + link
            if newlink.endswith("fp?dl"):
                savefp(newlink)
                continue
            if newlink.find(inputurl) >= 0 and newlink not in crawled:
                tocrawl.append(newlink)

if __name__ == "__main__":
    main()