subdivxget

   1 #!/usr/bin/env python
   2
   3 import sys
   4 import urllib
   5 import zipfile
   6 import subprocess
   7 import HTMLParser
   8
   9 class SubDivXQuery:
  10         def __init__(self, to_search):
  11                 self.host = "www.subdivx.com"
  12                 self.page = "/index.php"
  13                 self.down_page = "/bajar.php"
  14                 self.query = dict(
  15                         buscar = to_search,
  16                         accion = 5,
  17                         masdesc = '',
  18                         subtitulos = 1,
  19                         realiza_b = 1,
  20                 )
  21         @property
  22         def url(self):
  23                 return 'http://%s%s?%s' % (self.host, self.page,
  24                                 urllib.urlencode(self.query))
  25         @property
  26         def page_uri(self):
  27                 return self.page + '?' + urllib.urlencode(self.query)
  28         @property
  29         def down_uri(self):
  30                 return 'http://' + self.host + self.down_page
  31
  32 class Subtitle:
  33         pass
  34
  35 class SubDivXHTMLParser(HTMLParser.HTMLParser):
  36
  37         IDLE = 1
  38         HEADER = 2
  39
  40         def __init__(self, down_uri):
  41                 HTMLParser.HTMLParser.__init__(self)
  42                 self.down_uri = down_uri
  43                 self.depth = 0
  44                 self.parsing = False
  45                 self.subs = []
  46                 self.attr = None
  47
  48         def handle_starttag(self, tag, attrs):
  49                 attrs = dict(attrs)
  50                 if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
  51                         #self.cur = Subtitle()
  52                         self.cur = dict()
  53                         self.subs.append(self.cur)
  54                         self.parsing = True
  55                 if not self.parsing:
  56                         return
  57                 if tag == 'div':
  58                         if attrs.get('id') == 'buscador_detalle':
  59                                 self.parsing = True
  60                         elif attrs.get('id') == 'buscador_detalle_sub':
  61                                 self.attr = 'desc'
  62                 elif tag == 'a':
  63                         if attrs.get('class') == 'titulo_menu_izq':
  64                                 self.attr = 'title'
  65                         elif attrs.get('href', '').startswith(self.down_uri):
  66                                 self.cur['url'] = attrs['href']
  67                 if self.parsing:
  68                         self.depth += 1
  69
  70         def handle_endtag(self, tag):
  71                 if self.parsing:
  72                         self.depth -= 1
  73                 if self.depth == 0:
  74                         self.parsing = False
  75
  76         def handle_data(self, data):
  77                 if self.parsing:
  78                         data = data.strip()
  79                         if self.attr is not None and data:
  80                                 self.cur[self.attr] = data
  81                                 self.attr = None
  82                                 #self.cur[self.attr] = self.cur.get(self.attr, '') + data.strip()
  83                                 #setattr(self.cur, self.attr, data.strip())
  84                         elif data in ('Downloads:', 'Cds:', 'Comentarios:',
  85                                         'Formato:'):
  86                                 self.attr = data[:-1].lower()
  87                         elif data == 'Subido por:':
  88                                 self.attr = 'autor'
  89                         elif data == 'el':
  90                                 self.attr = 'fecha'
  91
  92
  93 def get_subs(query_str):
  94         query = SubDivXQuery(query_str)
  95
  96         url = urllib.urlopen(query.url)
  97
  98         parser = SubDivXHTMLParser(query.down_uri)
  99
 100         for line in url:
 101                 parser.feed(line)
 102
 103         url.close()
 104
 105         zip_exts = ('application/zip',)
 106         rar_exts = ('application/rar', 'application/x-rar-compressed')
 107
 108         for sub in sorted(parser.subs, key=lambda s: int(s['downloads']), reverse=True):
 109                 print '''\
 110         - %(title)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
 111           %(desc)s
 112                 DOWNLOADING ...
 113         ''' % sub
 114                 fname, headers = urllib.urlretrieve(sub['url'])
 115                 if 'Content-Type' in headers:
 116                         if headers['Content-Type'] in zip_exts:
 117                                 z = zipfile.ZipFile(fname, 'r')
 118                                 z.printdir()
 119                                 for fn in z.namelist():
 120                                         if fn.endswith('.srt') or fn.endswith('.sub'):
 121                                                 if '..' in fn or fn.startswith('/'):
 122                                                         print 'Dangerous file name:', fn
 123                                                         continue
 124                                                 print 'Extracting', fn, '...'
 125                                                 z.extract(fn)
 126                         elif headers['Content-Type'] in rar_exts:
 127                                 if subprocess.call(['rar', 'x', fname]) != 0:
 128                                         print 'Error unraring file %s' % fname
 129                         else:
 130                                 print 'Unrecognized file type:', headers['Content-Type']
 131                 else:
 132                         print 'No Content-Type!'
 133
 134
 135 for q in sys.argv[1:]:
 136         get_subs(q)
 137