subdivxget

   1 #!/usr/bin/env python
   2
   3 import sys
   4 import urllib
   5 import zipfile
   6 import subprocess
   7 import HTMLParser
   8
   9 class SubDivXQuery:
  10         def __init__(self, to_search):
  11                 self.host = "www.subdivx.com"
  12                 self.page = "/index.php"
  13                 self.down_page = "/bajar.php"
  14                 self.query = dict(
  15                         buscar = to_search,
  16                         accion = 5,
  17                         masdesc = '',
  18                         subtitulos = 1,
  19                         realiza_b = 1,
  20                 )
  21         @property
  22         def url(self):
  23                 return 'http://%s%s?%s' % (self.host, self.page,
  24                                 urllib.urlencode(self.query))
  25         @property
  26         def page_uri(self):
  27                 return self.page + '?' + urllib.urlencode(self.query)
  28         @property
  29         def down_uri(self):
  30                 return 'http://' + self.host + self.down_page
  31
  32
  33 class SubDivXHTMLParser(HTMLParser.HTMLParser):
  34
  35         IDLE = 1
  36         HEADER = 2
  37
  38         def __init__(self, down_uri):
  39                 HTMLParser.HTMLParser.__init__(self)
  40                 self.down_uri = down_uri
  41                 self.depth = 0
  42                 self.parsing = False
  43                 self.subs = []
  44                 self.attr = None
  45
  46         def handle_starttag(self, tag, attrs):
  47                 attrs = dict(attrs)
  48                 if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
  49                         self.cur = dict()
  50                         self.subs.append(self.cur)
  51                         self.parsing = True
  52                 if not self.parsing:
  53                         return
  54                 if tag == 'div':
  55                         if attrs.get('id') == 'buscador_detalle':
  56                                 self.parsing = True
  57                         elif attrs.get('id') == 'buscador_detalle_sub':
  58                                 self.attr = 'desc'
  59                 elif tag == 'a':
  60                         if attrs.get('class') == 'titulo_menu_izq':
  61                                 self.attr = 'title'
  62                         elif attrs.get('href', '').startswith(self.down_uri):
  63                                 self.cur['url'] = attrs['href']
  64                 if self.parsing:
  65                         self.depth += 1
  66
  67         def handle_endtag(self, tag):
  68                 if self.parsing:
  69                         self.depth -= 1
  70                 if self.depth == 0:
  71                         self.parsing = False
  72
  73         def handle_data(self, data):
  74                 if self.parsing:
  75                         data = data.strip()
  76                         if self.attr is not None and data:
  77                                 self.cur[self.attr] = data
  78                                 self.attr = None
  79                         elif data in ('Downloads:', 'Cds:', 'Comentarios:',
  80                                         'Formato:'):
  81                                 self.attr = data[:-1].lower()
  82                         elif data == 'Subido por:':
  83                                 self.attr = 'autor'
  84                         elif data == 'el':
  85                                 self.attr = 'fecha'
  86
  87
  88 def get_subs(query_str):
  89         query = SubDivXQuery(query_str)
  90
  91         url = urllib.urlopen(query.url)
  92
  93         parser = SubDivXHTMLParser(query.down_uri)
  94
  95         for line in url:
  96                 parser.feed(line)
  97
  98         url.close()
  99
 100         zip_exts = ('application/zip',)
 101         rar_exts = ('application/rar', 'application/x-rar-compressed')
 102
 103         for sub in sorted(parser.subs, key=lambda s: int(s['downloads']), reverse=True):
 104                 print '''\
 105         - %(title)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
 106           %(desc)s
 107                 DOWNLOADING ...
 108         ''' % sub
 109                 fname, headers = urllib.urlretrieve(sub['url'])
 110                 if 'Content-Type' in headers:
 111                         if headers['Content-Type'] in zip_exts:
 112                                 z = zipfile.ZipFile(fname, 'r')
 113                                 z.printdir()
 114                                 for fn in z.namelist():
 115                                         if fn.endswith('.srt') or fn.endswith('.sub'):
 116                                                 if '..' in fn or fn.startswith('/'):
 117                                                         print 'Dangerous file name:', fn
 118                                                         continue
 119                                                 print 'Extracting', fn, '...'
 120                                                 z.extract(fn)
 121                         elif headers['Content-Type'] in rar_exts:
 122                                 if subprocess.call(['rar', 'x', fname]) != 0:
 123                                         print 'Error unraring file %s' % fname
 124                         else:
 125                                 print 'Unrecognized file type:', headers['Content-Type']
 126                 else:
 127                         print 'No Content-Type!'
 128
 129
 130 for q in sys.argv[1:]:
 131         get_subs(q)
 132