]> git.llucax.com Git - software/subdivxget.git/blob - subdivxget
Add fairly fine grained filtering support
[software/subdivxget.git] / subdivxget
1 #!/usr/bin/env python
2
3 import sys
4 import urllib
5 import zipfile
6 import subprocess
7 import HTMLParser
8
class SubDivXQuery:
	"""Build the URLs needed to run a search against www.subdivx.com.

	Holds the host, the search/download page paths and the query
	parameters for one results page of one search string.
	"""
	def __init__(self, to_search, page_number):
		self.host = "www.subdivx.com"
		self.page = "/index.php"
		self.down_page = "/bajar.php"
		# Parameters expected by the subdivx.com search form
		self.query = {
			'buscar': to_search,
			'pg': page_number,
			'accion': 5,
			'masdesc': '',
			'subtitulos': 1,
			'realiza_b': 1,
		}
	@property
	def url(self):
		# Absolute search URL for this page of results
		return 'http://%s%s' % (self.host, self.page_uri)
	@property
	def page_uri(self):
		# Search path plus encoded query string (no scheme/host)
		return '%s?%s' % (self.page, urllib.urlencode(self.query))
	@property
	def down_uri(self):
		# Base URL that subtitle download links start with
		return 'http://%s%s' % (self.host, self.down_page)
32
33
class SubDivXHTMLParser(HTMLParser.HTMLParser):
	"""Scrape subtitle entries out of a subdivx.com search results page.

	Feed it the page HTML; it fills self.subs with one dict per
	subtitle entry. Keys observed to be set by this code: 'titulo',
	'desc', 'url', 'autor', 'fecha', 'downloads', 'cds',
	'comentarios' and 'formato' (a given entry may lack some of them
	if the page doesn't show that datum).
	"""

	# NOTE(review): these two constants are never referenced below —
	# presumably leftovers from an earlier state-machine design.
	IDLE = 1
	HEADER = 2

	def __init__(self, down_uri):
		HTMLParser.HTMLParser.__init__(self)
		self.down_uri = down_uri		# prefix identifying download links
		self.depth = 0				# tag nesting depth inside an entry
		self.parsing = False			# True while inside a subtitle entry
		self.subs = []				# accumulated subtitle dicts
		self.attr = None			# key of self.cur currently receiving text
		self.attr_depth = 0			# depth at which self.attr was opened
		self.cur = None				# dict for the entry being parsed
		self.in_script_style = False		# inside <script>/<style>, text ignored

	def handle_starttag(self, tag, attrs):
		attrs = dict(attrs)
		# Each search result starts with this header div; open a fresh
		# entry dict and start capturing.
		if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
			self.cur = dict()
			self.subs.append(self.cur)
			self.parsing = True
		if not self.parsing:
			return
		if tag == 'script' or tag == 'style':
			self.in_script_style = True
			return
		if tag == 'div':
			if attrs.get('id') == 'buscador_detalle':
				self.parsing = True
			elif attrs.get('id') == 'buscador_detalle_sub':
				# Description body: capture text until depth drops
				# back below attr_depth (checked in handle_endtag).
				self.attr = 'desc'
				self.attr_depth = self.depth + 1
				self.cur[self.attr] = ''
		elif tag == 'a':
			if attrs.get('class') == 'titulo_menu_izq':
				# Link holding the subtitle title text
				self.attr = 'titulo'
				self.attr_depth = self.depth + 1
				self.cur[self.attr] = ''
			elif attrs.get('href', '').startswith(self.down_uri):
				# Download link for this entry
				self.cur['url'] = attrs['href']
		# br are usually not closed, so ignore them in depth calculation
		if self.parsing and tag != 'br':
			self.depth += 1

	def handle_endtag(self, tag):
		if self.parsing:
			if tag == 'script' or tag == 'style':
				self.in_script_style = False
				return
			# Closing the tag that opened the current attribute ends
			# its text capture.
			if self.depth == self.attr_depth:
				self.attr = None
				self.attr_depth = 0
			# see comment in handle_starttag()
			if tag != 'br':
				self.depth -= 1
		# Back at depth 0 means the whole entry div was closed.
		if self.depth == 0:
			self.parsing = False

	def handle_data(self, data):
		if not self.parsing:
			return
		data = data.strip()
		# Hack to handle comments in <script> <style> which don't end
		# up in handle_comment(), so we just ignore the whole tags
		if self.in_script_style:
			return
		if self.attr is not None and data:
			# Accumulate text into the currently open attribute.
			self.cur[self.attr] += ' ' + data
			# attr_depth == 0 marks a label/value pair (set below):
			# the single following text node is the whole value.
			if self.attr_depth == 0:
				self.cur[self.attr] = self.cur[self.attr].strip()
				self.attr = None
				self.attr_depth = 0
		# Label texts on the page; the text node right after each
		# label is captured as its value (e.g. 'Downloads:' -> key
		# 'downloads').
		elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
			self.attr = data[:-1].lower()
			self.attr_depth = 0
			self.cur[self.attr] = ''
		elif data == 'Subido por:':
			self.attr = 'autor'
			self.attr_depth = 0
			self.cur[self.attr] = ''
		elif data == 'el':
			# 'el' precedes the upload date on the page — presumably
			# part of "Subido por: X el <fecha>"; TODO confirm markup.
			self.attr = 'fecha'
			self.attr_depth = 0
			self.cur[self.attr] = ''
119
120
def filter_subtitles(subs, filters):
	"""Return the subtitles that match *all* the given filters.

	Each filter is a case-insensitive substring search. A filter of
	the form 'X:text' restricts the search to a single field, where X
	is one of: t=titulo, d=desc, a=autor, f=formato, c=comentarios,
	C=cds, F=fecha, D=downloads. Any other single-char prefix is not
	treated as a field and the whole filter string is searched raw.
	An empty filter list returns subs unchanged.
	"""
	# Field selector -> subtitle dict key
	field_keys = dict(
		t='titulo',
		d='desc',
		a='autor',
		f='formato',
		c='comentarios',
		C='cds',
		F='fecha',
		D='downloads',
	)

	def matches_any_field(sub, text):
		# Substring search over every value of the entry
		for value in sub.values():
			if value.lower().find(text) >= 0:
				return True
		return False

	def matches(sub, filt):
		field = None
		if len(filt) > 2 and filt[1] == ':':
			field = filt[0]
			filt = filt[2:]
		filt = filt.lower()

		if field is None:
			return matches_any_field(sub, filt)
		key = field_keys.get(field)
		if key is None:
			# Not a recognized field identifier, use the raw filter
			return matches_any_field(sub, field + ':' + filt)
		# BUG FIX: use .get() so an entry that lacks this field simply
		# doesn't match, instead of raising KeyError (the parser only
		# sets fields it actually finds in the page).
		return sub.get(key, '').lower().find(filt) >= 0

	if not filters:
		return subs

	# Keep only the entries that satisfy every filter (AND semantics).
	result = []
	for sub in subs:
		for filt in filters:
			if not matches(sub, filt):
				break
		else:
			result.append(sub)
	return result
169
170
def subdivx_get_subs(query_str):
	"""Fetch every result page for query_str from subdivx.com.

	Keeps requesting successive pages until one yields no entries,
	then returns all collected subtitle dicts sorted by download
	count, most downloaded first.
	"""
	all_subs = []
	page = 1
	while True:
		query = SubDivXQuery(query_str, page)
		parser = SubDivXHTMLParser(query.down_uri)
		response = urllib.urlopen(query.url)

		for chunk in response:
			parser.feed(chunk)

		response.close()

		# An empty page means we ran past the last page of results.
		if not parser.subs:
			break

		all_subs.extend(parser.subs)
		page += 1

	return sorted(all_subs, key=lambda s: int(s['downloads']), reverse=True)
191
192
193 def get_subs(query_str, filters):
194         zip_exts = ('application/zip',)
195         rar_exts = ('application/rar', 'application/x-rar-compressed')
196
197         subs = subdivx_get_subs(query_str)
198         subs = filter_subtitles(subs, filters)
199
200         for sub in subs:
201                 print '''\
202         - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
203           %(desc)s
204                 DOWNLOADING ...
205         ''' % sub
206                 fname, headers = urllib.urlretrieve(sub['url'])
207                 if 'Content-Type' in headers:
208                         if headers['Content-Type'] in zip_exts:
209                                 z = zipfile.ZipFile(fname, 'r')
210                                 z.printdir()
211                                 for fn in z.namelist():
212                                         if fn.endswith('.srt') or fn.endswith('.sub'):
213                                                 if '..' in fn or fn.startswith('/'):
214                                                         print 'Dangerous file name:', fn
215                                                         continue
216                                                 print 'Extracting', fn, '...'
217                                                 z.extract(fn)
218                         elif headers['Content-Type'] in rar_exts:
219                                 if subprocess.call(['rar', 'x', fname]) != 0:
220                                         print 'Error unraring file %s' % fname
221                         else:
222                                 print 'Unrecognized file type:', headers['Content-Type']
223                 else:
224                         print 'No Content-Type!'
225
226
227 get_subs(sys.argv[1], sys.argv[2:])
228