subdivxget

   1 #!/usr/bin/env python
   2
   3 import sys
   4 import urllib
   5 import zipfile
   6 import subprocess
   7 import HTMLParser
   8
   9 class SubDivXQuery:
  10         def __init__(self, to_search, page_number):
  11                 self.host = "www.subdivx.com"
  12                 self.page = "/index.php"
  13                 self.down_page = "/bajar.php"
  14                 self.query = dict(
  15                         buscar = to_search,
  16                         pg = page_number,
  17                         accion = 5,
  18                         masdesc = '',
  19                         subtitulos = 1,
  20                         realiza_b = 1,
  21                 )
  22         @property
  23         def url(self):
  24                 return 'http://%s%s?%s' % (self.host, self.page,
  25                                 urllib.urlencode(self.query))
  26         @property
  27         def page_uri(self):
  28                 return self.page + '?' + urllib.urlencode(self.query)
  29         @property
  30         def down_uri(self):
  31                 return 'http://' + self.host + self.down_page
  32
  33
  34 class SubDivXHTMLParser(HTMLParser.HTMLParser):
  35
  36         IDLE = 1
  37         HEADER = 2
  38
  39         def __init__(self, down_uri):
  40                 HTMLParser.HTMLParser.__init__(self)
  41                 self.down_uri = down_uri
  42                 self.depth = 0
  43                 self.parsing = False
  44                 self.subs = []
  45                 self.attr = None
  46                 self.cur = None
  47                 self.in_script_style = False
  48
  49         def handle_starttag(self, tag, attrs):
  50                 attrs = dict(attrs)
  51                 if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
  52                         self.cur = dict()
  53                         self.subs.append(self.cur)
  54                         self.parsing = True
  55                 if not self.parsing:
  56                         return
  57                 if tag == 'script' or tag == 'style':
  58                         self.in_script_style = True
  59                         return
  60                 if tag == 'div':
  61                         if attrs.get('id') == 'buscador_detalle':
  62                                 self.parsing = True
  63                         elif attrs.get('id') == 'buscador_detalle_sub':
  64                                 self.attr = 'desc'
  65                 elif tag == 'a':
  66                         if attrs.get('class') == 'titulo_menu_izq':
  67                                 self.attr = 'titulo'
  68                         elif attrs.get('href', '').startswith(self.down_uri):
  69                                 self.cur['url'] = attrs['href']
  70                 # br are usually not closed, so ignore them in depth calculation
  71                 if self.parsing and tag != 'br':
  72                         self.depth += 1
  73
  74         def handle_endtag(self, tag):
  75                 if self.parsing:
  76                         if tag == 'script' or tag == 'style':
  77                                 self.in_script_style = False
  78                                 return
  79                         # see comment in handle_starttag()
  80                         if tag != 'br':
  81                                 self.depth -= 1
  82                 if self.depth == 0:
  83                         self.parsing = False
  84
  85         def handle_data(self, data):
  86                 if not self.parsing:
  87                         return
  88                 data = data.strip()
  89                 # Hack to handle comments in <script> <style> which don't end
  90                 # up in handle_comment(), so we just ignore the whole tags
  91                 if self.in_script_style:
  92                         return
  93                 if self.attr is not None and data:
  94                         self.cur[self.attr] = data
  95                         self.attr = None
  96                 elif data in ('Downloads:', 'Cds:', 'Comentarios:',
  97                                 'Formato:'):
  98                         self.attr = data[:-1].lower()
  99                 elif data == 'Subido por:':
 100                         self.attr = 'autor'
 101                 elif data == 'el':
 102                         self.attr = 'fecha'
 103
 104
 105 def subdivx_get_subs(query_str):
 106         page_number = 1
 107         subs = []
 108         while True:
 109                 query = SubDivXQuery(query_str, page_number)
 110                 url = urllib.urlopen(query.url)
 111                 parser = SubDivXHTMLParser(query.down_uri)
 112
 113                 for line in url:
 114                         parser.feed(line)
 115
 116                 url.close()
 117
 118                 if not parser.subs:
 119                         break
 120
 121                 subs.extend(parser.subs)
 122                 page_number += 1
 123
 124         return sorted(subs, key=lambda s: int(s['downloads']), reverse=True)
 125
 126
 127 def get_subs(query_str):
 128         zip_exts = ('application/zip',)
 129         rar_exts = ('application/rar', 'application/x-rar-compressed')
 130
 131         for sub in subdivx_get_subs(query_str):
 132                 print '''\
 133         - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
 134           %(desc)s
 135                 DOWNLOADING ...
 136         ''' % sub
 137                 fname, headers = urllib.urlretrieve(sub['url'])
 138                 if 'Content-Type' in headers:
 139                         if headers['Content-Type'] in zip_exts:
 140                                 z = zipfile.ZipFile(fname, 'r')
 141                                 z.printdir()
 142                                 for fn in z.namelist():
 143                                         if fn.endswith('.srt') or fn.endswith('.sub'):
 144                                                 if '..' in fn or fn.startswith('/'):
 145                                                         print 'Dangerous file name:', fn
 146                                                         continue
 147                                                 print 'Extracting', fn, '...'
 148                                                 z.extract(fn)
 149                         elif headers['Content-Type'] in rar_exts:
 150                                 if subprocess.call(['rar', 'x', fname]) != 0:
 151                                         print 'Error unraring file %s' % fname
 152                         else:
 153                                 print 'Unrecognized file type:', headers['Content-Type']
 154                 else:
 155                         print 'No Content-Type!'
 156
 157
 158 for q in sys.argv[1:]:
 159         get_subs(q)
 160