Python - web crawler
hi everyone! I've written the basis for a web spider...
here it is:
Source code - Python

#!/usr/bin/python
import urllib
from sgmllib import SGMLParser
import sys
import re
import os
import shutil

class list_urls(SGMLParser):
   # Collects the href value of every <a> tag found in the page.
   def reset(self):
      SGMLParser.reset(self)
      self.urls = []

   def start_a(self, av):
      # av is a list of (attribute, value) pairs for the <a> tag
      links = [value for attr, value in av if attr == 'href']
      if links:
         self.urls.extend(links)

def make_log_title(url, suffix):
   # Build a log file name from the URL,
   # e.g. http://www.example.com -> wwwexample_urls.log
   n = re.sub("http://", "", url)
   name = n.split(".")
   return name[0] + name[1] + suffix

def url_scanner(url):
   # Fetch the page, parse it and log every link found.
   conn = urllib.urlopen(url)
   scan = list_urls()
   scan.feed(conn.read())
   conn.close()
   scan.close()
   log_title = make_log_title(url, "_urls.log")
   print log_title
   log_file = open("logs/" + log_title, "w")
   for link in scan.urls:
      print link
      log_file.write(link + "\n")
   log_file.close()
   print "\n\nscanning finished and logged."

def html(url):
   # Fetch the page and save its HTML source to a log file.
   go = urllib.urlopen(url)
   s = go.read()
   go.close()
   log_title = make_log_title(url, "_html_source_code.log")
   log_file = open("logs/" + log_title, "w")
   log_file.write(s)
   log_file.close()
   print s
   print "\n\npage saved."

def autoscan(f):
   # Scan every URL listed (one per line) in the given file.
   fil = open(f, "r")
   for line in fil.readlines():
      url = line.strip()
      if not url:
         continue
      print "\n\n" + url + "\n-----------------------------------------\n"
      conn = urllib.urlopen(url)
      scan = list_urls()
      scan.feed(conn.read())
      conn.close()
      scan.close()
      log_title = make_log_title(url, "_urls.log")
      log_file = open("logs/" + log_title, "w")
      for link in scan.urls:
         print "\n" + link
         log_file.write(link + "\n")
      log_file.close()
      print "\n\nscanning finished and logged."
   fil.close()  # close the URL list only after the whole loop

def help():
   try:
      version = open("version", "r")
      print "\n" + version.read()
      version.close()
   except IOError:
      pass
   print "USAGE:\t\tpython " + sys.argv[0] + " [url|logfile] [option]"
   print "\nOPTIONS:"
   print "\t\t-u\turl scanner"
   print "\t\t-sc\thtml source code"
   print "\t\t-auto\tauto scan"
   print "\t\t-cl\tto clear the log files"
   sys.exit()

def remove_logs():
   try:
      shutil.rmtree('logs')
      print "logs removed"
   except OSError:
      print "cannot remove log files. maybe they've already been removed."

def main():
   try:
      URL = sys.argv[1]
      OPTION = sys.argv[2]
   except IndexError:
      help()
   log_dir = "logs"
   if not os.path.exists(log_dir):
      os.mkdir(log_dir, 0777)
   if OPTION == "-u":
      print "scanning " + URL + " ...\n\n"
      url_scanner(URL)
   elif OPTION == "-sc":
      print "retrieving " + URL + " HTML source code ...\n\n"
      html(URL)
   elif OPTION == "-auto":
      print "scanning " + URL + " ...\n\n"
      autoscan(URL)
   else:
      help()

if __name__ == "__main__":
   try:
      URL = sys.argv[1]
   except IndexError:
      help()
   if URL == "-cl":
      remove_logs()
      sys.exit()
   main()
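
for reference, assuming the script is saved as spider.py and the URL list as sites.txt (both names are my own, the post doesn't give a filename), it would be run like this:

   python spider.py http://www.example.com -u      # log every link on the page
   python spider.py http://www.example.com -sc     # save the page's HTML source
   python spider.py sites.txt -auto                # scan every URL listed in sites.txt
   python spider.py -cl                            # delete the logs directory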

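a note for anyone on Python 3: sgmllib and urllib.urlopen were removed there. this is just a minimal sketch of the same link-extraction idea using html.parser and urllib.request, not part of the original script:

Source code - Python

# Hypothetical Python 3 equivalent of the list_urls class above:
# collect the href attribute of every <a> tag with html.parser.
from html.parser import HTMLParser
from urllib.request import urlopen

class ListUrls(HTMLParser):
    def __init__(self):
        super().__init__()
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            # attrs is a list of (attribute, value) pairs, as in sgmllib
            self.urls.extend(value for attr, value in attrs if attr == "href")

def url_scanner(url):
    # Fetch the page and return every link found in it.
    html = urlopen(url).read().decode("utf-8", "replace")
    parser = ListUrls()
    parser.feed(html)
    return parser.urls

if __name__ == "__main__":
    for link in url_scanner("http://www.example.com"):
        print(link)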

ciao!
