X-Git-Url: https://git.llucax.com/software/subdivxget.git/blobdiff_plain/44c70b16fa26aad38244f816c2b0ad3c3a170ecc..HEAD:/subdivxget diff --git a/subdivxget b/subdivxget index 1aecc66..5997114 100755 --- a/subdivxget +++ b/subdivxget @@ -1,10 +1,36 @@ #!/usr/bin/env python import sys -import urllib +if sys.version_info[0] < 3: + from HTMLParser import HTMLParser + from urllib import urlopen, urlretrieve, urlencode + def get_encoding(info): + return info.getparam('charset') + +else: + from html.parser import HTMLParser + from urllib.request import urlopen, urlretrieve + from urllib.parse import urlencode + def get_encoding(info): + return info.get_content_charset('ascii') import zipfile import subprocess -import HTMLParser + + +def output(fo, fmt, *args, **kargs): + if not args: + args = kargs + fo.write((fmt % args) + '\n') + +def echo(fmt, *args, **kargs): + global opts + if opts.quiet: + return + output(sys.stdout, fmt, *args, **kargs) + +def error(fmt, *args, **kargs): + output(sys.stderr, fmt, *args, **kargs) + class SubDivXQuery: def __init__(self, to_search, page_number): @@ -22,27 +48,30 @@ class SubDivXQuery: @property def url(self): return 'http://%s%s?%s' % (self.host, self.page, - urllib.urlencode(self.query)) + urlencode(self.query)) @property def page_uri(self): - return self.page + '?' + urllib.urlencode(self.query) + return self.page + '?' + urlencode(self.query) @property def down_uri(self): return 'http://' + self.host + self.down_page -class SubDivXHTMLParser(HTMLParser.HTMLParser): +class SubDivXHTMLParser(HTMLParser): IDLE = 1 HEADER = 2 def __init__(self, down_uri): - HTMLParser.HTMLParser.__init__(self) + HTMLParser.__init__(self) self.down_uri = down_uri self.depth = 0 self.parsing = False self.subs = [] self.attr = None + self.attr_depth = 0 + self.cur = None + self.in_script_style = False def handle_starttag(self, tag, attrs): attrs = dict(attrs) @@ -52,22 +81,38 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = True if not self.parsing: return + if tag == 'script' or tag == 'style': + self.in_script_style = True + return if tag == 'div': if attrs.get('id') == 'buscador_detalle': self.parsing = True elif attrs.get('id') == 'buscador_detalle_sub': self.attr = 'desc' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif tag == 'a': if attrs.get('class') == 'titulo_menu_izq': - self.attr = 'title' + self.attr = 'titulo' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif attrs.get('href', '').startswith(self.down_uri): self.cur['url'] = attrs['href'] - if self.parsing: + # br are usually not closed, so ignore them in depth calculation + if self.parsing and tag != 'br': self.depth += 1 def handle_endtag(self, tag): if self.parsing: - self.depth -= 1 + if tag == 'script' or tag == 'style': + self.in_script_style = False + return + if self.depth == self.attr_depth: + self.attr = None + self.attr_depth = 0 + # see comment in handle_starttag() + if tag != 'br': + self.depth -= 1 if self.depth == 0: self.parsing = False @@ -75,16 +120,78 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if not self.parsing: return data = data.strip() + # Hack to handle comments in