#!/usr/bin/env python

import sys

# Python 2 and Python 3 ship the HTML parser and the URL helpers in
# different modules, so pick the right ones and define get_encoding()
# on top of whichever header API is available.
if sys.version_info[0] < 3:
    from HTMLParser import HTMLParser
    from urllib import urlopen, urlretrieve, urlencode

    def get_encoding(info):
        """Return the ``charset`` parameter of the header object *info*."""
        return info.getparam('charset')
else:
    from html.parser import HTMLParser
    from urllib.request import urlopen, urlretrieve
    from urllib.parse import urlencode

    def get_encoding(info):
        """Return the content charset of *info*, or ``'ascii'`` if unset."""
        return info.get_content_charset('ascii')

import zipfile
import subprocess


def output(fo, fmt, *args, **kargs):
    """Write ``fmt % args`` plus a trailing newline to the file object *fo*.

    Positional arguments win: the keyword arguments are used as the
    formatting mapping only when no positional arguments were given.
    """
    if not args:
        args = kargs
    fo.write((fmt % args) + '\n')


def echo(fmt, *args, **kargs):
    """Format a message like output() and write it to ``sys.stdout``."""
    output(sys.stdout, fmt, *args, **kargs)


def error(fmt, *args, **kargs):
    """Format a message like output() and write it to ``sys.stderr``."""
    output(sys.stderr, fmt, *args, **kargs)


class SubDivXQuery(object):
    """Build the URLs for one page of a subdivx.com subtitle search.

    Derives from ``object`` so that the properties below live on a
    new-style class when running under Python 2 as well.
    """

    def __init__(self, to_search, page_number):
        """Store the search text and the result page number to request."""
        self.host = "www.subdivx.com"
        self.page = "/index.php"
        self.down_page = "/bajar.php"
        # Query-string parameters sent to the search page.
        self.query = dict(
            buscar=to_search,
            pg=page_number,
            accion=5,
            masdesc='',
            subtitulos=1,
            realiza_b=1,
        )

    @property
    def url(self):
        """Absolute URL of the search page, including the query string."""
        return 'http://%s%s?%s' % (self.host, self.page,
                                   urlencode(self.query))

    @property
    def page_uri(self):
        """Host-relative path of the search page plus the query string."""
        return self.page + '?' + urlencode(self.query)

    @property
    def down_uri(self):
        """Absolute URL of the download page, without any query string."""
        return 'http://' + self.host + self.down_page


class SubDivXHTMLParser(HTMLParser):
    """Collect subtitle entries from a search results page.

    Each entry is a dict appended to ``self.subs``.  The start-tag handler
    creates the keys 'titulo', 'desc' and 'url' as it meets the matching
    elements.
    """

    # Not referenced by the methods visible in this part of the file.
    IDLE = 1
    HEADER = 2

    def __init__(self, down_uri):
        """Initialise the parser.

        *down_uri* is the URL prefix used to recognise download links.
        """
        # Explicit base-class call (instead of super()) as in the original,
        # which works with both the Python 2 and Python 3 HTMLParser.
        HTMLParser.__init__(self)
        self.down_uri = down_uri
        self.depth = 0          # nesting depth of tags seen while parsing
        self.parsing = False    # True while inside a result entry
        self.subs = []          # every entry dict found so far
        self.attr = None        # key of self.cur currently being collected
        self.attr_depth = 0     # depth at which self.attr was opened
        self.cur = None         # entry dict currently being filled
        self.in_script_style = False

    def handle_starttag(self, tag, attrs):
        """Track nesting depth and start entries/fields on known elements."""
        attrs = dict(attrs)
        # A 'menu_detalle_buscador' div opens a new result entry.
        if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
            self.cur = dict()
            self.subs.append(self.cur)
            self.parsing = True
        if not self.parsing:
            return
        # script/style tags are flagged but never counted in the depth;
        # handle_endtag() mirrors this so the counter stays balanced.
        if tag == 'script' or tag == 'style':
            self.in_script_style = True
            return
        if tag == 'div':
            if attrs.get('id') == 'buscador_detalle':
                self.parsing = True
            elif attrs.get('id') == 'buscador_detalle_sub':
                self.attr = 'desc'
                self.attr_depth = self.depth + 1
                self.cur[self.attr] = ''
        elif tag == 'a':
            if attrs.get('class') == 'titulo_menu_izq':
                self.attr = 'titulo'
                self.attr_depth = self.depth + 1
                self.cur[self.attr] = ''
            elif attrs.get('href', '').startswith(self.down_uri):
                self.cur['url'] = attrs['href']
        # br are usually not closed, so ignore them in depth calculation
        if self.parsing and tag != 'br':
            self.depth += 1

    def handle_endtag(self, tag):
        """Close the current field/entry when the matching depth is left."""
        if self.parsing:
            if tag == 'script' or tag == 'style':
                self.in_script_style = False
                return
            # Leaving the element that opened the current field ends it.
            if self.depth == self.attr_depth:
                self.attr = None
                self.attr_depth = 0
            # see comment in handle_starttag()
            if tag != 'br':
                self.depth -= 1
        # Back at depth zero: the entry is finished.
        if self.depth == 0:
            self.parsing = False

    def handle_data(self, data):
        if not self.parsing:
            return
        data = data.strip()
        # Hack to handle comments in