]> git.llucax.com Git - software/subdivxget.git/blob - subdivxget
Use lowercase for help messages to match --help
[software/subdivxget.git] / subdivxget
1 #!/usr/bin/env python
2
3 import sys
# Python 2/3 compatibility shim: import HTMLParser and the URL helpers
# from their old or new locations, and define a uniform get_encoding()
# to read the charset out of a urlopen() response's info() object.
if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        from urllib import urlopen, urlretrieve, urlencode
        def get_encoding(info):
                # Python 2 message objects expose the charset via getparam().
                return info.getparam('charset')

else:
        from html.parser import HTMLParser
        from urllib.request import urlopen, urlretrieve
        from urllib.parse import urlencode
        def get_encoding(info):
                # Python 3: default to 'ascii' when no charset is declared.
                return info.get_content_charset('ascii')
16 import zipfile
17 import subprocess
18
19
def output(fo, fmt, *args, **kargs):
        """Write fmt, %-formatted with args, to the file object fo.

        Positional arguments take precedence; when none are given the
        keyword arguments are used as a mapping instead.  A trailing
        newline is always appended.
        """
        params = args or kargs
        fo.write((fmt % params) + '\n')
24
def echo(fmt, *args, **kargs):
        """Print a progress message to stdout, unless --quiet was given."""
        global opts
        if not opts.quiet:
                output(sys.stdout, fmt, *args, **kargs)
30
def error(fmt, *args, **kargs):
        """Print a message to stderr (shown even in --quiet mode)."""
        output(sys.stderr, fmt, *args, **kargs)
33
34
class SubDivXQuery:
        """Builds the URLs needed to search and download from subdivx.com."""

        def __init__(self, to_search, page_number):
                self.host = "www.subdivx.com"
                self.page = "/index.php"
                self.down_page = "/bajar.php"
                # Form fields expected by the subdivx.com search page.
                self.query = {
                        'buscar': to_search,
                        'pg': page_number,
                        'accion': 5,
                        'masdesc': '',
                        'subtitulos': 1,
                        'realiza_b': 1,
                }

        @property
        def url(self):
                """Absolute URL of the search results page."""
                return 'http://%s%s?%s' % (self.host, self.page,
                                urlencode(self.query))

        @property
        def page_uri(self):
                """Search results URI, relative to the host."""
                return self.page + '?' + urlencode(self.query)

        @property
        def down_uri(self):
                """Absolute base URL of the subtitle download page."""
                return 'http://' + self.host + self.down_page
58
59
class SubDivXHTMLParser(HTMLParser):
        """Scrapes subtitle entries from a subdivx.com search results page.

        Each result found is collected as a dict in self.subs; keys seen in
        this code are 'titulo', 'desc', 'url', 'downloads', 'cds',
        'comentarios', 'formato', 'autor' and 'fecha' (a key is only
        present if the corresponding data appeared in the page).
        """

        # NOTE(review): these two constants are not referenced anywhere in
        # this class; presumably leftovers from an earlier design.
        IDLE = 1
        HEADER = 2

        def __init__(self, down_uri):
                HTMLParser.__init__(self)
                self.down_uri = down_uri      # URL prefix marking download links
                self.depth = 0                # tag nesting depth inside a result
                self.parsing = False          # currently inside a result block?
                self.subs = []                # accumulated subtitle dicts
                self.attr = None              # dict key currently capturing text
                self.attr_depth = 0           # depth at which self.attr ends
                self.cur = None               # dict of the result being parsed
                self.in_script_style = False  # inside a <script>/<style> tag?

        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
                # Each search result begins with this div: start a new dict.
                if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
                        self.cur = dict()
                        self.subs.append(self.cur)
                        self.parsing = True
                if not self.parsing:
                        return
                # Text inside <script>/<style> must not be captured as data.
                if tag == 'script' or tag == 'style':
                        self.in_script_style = True
                        return
                if tag == 'div':
                        if attrs.get('id') == 'buscador_detalle':
                                # NOTE(review): parsing is already True here;
                                # this assignment appears redundant.
                                self.parsing = True
                        elif attrs.get('id') == 'buscador_detalle_sub':
                                # Description block: capture its text until
                                # this div closes (depth + 1).
                                self.attr = 'desc'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                elif tag == 'a':
                        if attrs.get('class') == 'titulo_menu_izq':
                                # Title link: capture its text.
                                self.attr = 'titulo'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                        elif attrs.get('href', '').startswith(self.down_uri):
                                # Link pointing at the download page.
                                self.cur['url'] = attrs['href']
                # br are usually not closed, so ignore them in depth calculation
                if self.parsing and tag != 'br':
                        self.depth += 1

        def handle_endtag(self, tag):
                if self.parsing:
                        if tag == 'script' or tag == 'style':
                                self.in_script_style = False
                                return
                        # Closing the tag that owns the current attribute
                        # finishes its text capture.
                        if self.depth == self.attr_depth:
                                self.attr = None
                                self.attr_depth = 0
                        # see comment in handle_starttag()
                        if tag != 'br':
                                self.depth -= 1
                # Back at depth 0 means the result block is fully closed.
                if self.depth == 0:
                        self.parsing = False

        def handle_data(self, data):
                if not self.parsing:
                        return
                data = data.strip()
                # Hack to handle comments in <script> <style> which don't end
                # up in handle_comment(), so we just ignore the whole tags
                if self.in_script_style:
                        return
                if self.attr is not None and data:
                        # Append this chunk to the attribute being captured.
                        self.cur[self.attr] += ' ' + data
                        # attr_depth == 0 marks a label-driven attribute (set
                        # below): it takes exactly one data chunk.
                        if self.attr_depth == 0:
                                self.cur[self.attr] = self.cur[self.attr].strip()
                                self.attr = None
                                self.attr_depth = 0
                elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
                        # A field label: the next data chunk is its value.
                        self.attr = data[:-1].lower()
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'Subido por:':
                        self.attr = 'autor'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'el':
                        self.attr = 'fecha'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
145
146
def filter_subtitles(subs, filters):
        """Return the subtitles in subs that match ALL the filters.

        Each filter has the format "X:text", where X is a one-letter field
        specifier (t=titulo, d=desc, a=autor, f=formato, c=comentarios,
        C=cds, F=fecha, D=downloads) and text is searched for
        case-insensitively in that field.  A filter without a recognized
        specifier is searched for in every field.  With no filters, subs
        is returned unchanged.
        """
        # Dispatch table replacing the original if/elif chain.
        field_keys = {
                't': 'titulo',
                'd': 'desc',
                'a': 'autor',
                'f': 'formato',
                'c': 'comentarios',
                'C': 'cds',
                'F': 'fecha',
                'D': 'downloads',
        }

        def is_any_good(sub, text):
                # True if text appears in any field of the subtitle.
                for value in sub.values():
                        if value.lower().find(text) >= 0:
                                return True
                return False

        def is_good(sub, filter_str):
                field = None
                if len(filter_str) > 2 and filter_str[1] == ':':
                        field = filter_str[0]
                        filter_str = filter_str[2:]
                filter_str = filter_str.lower()

                if field is None:
                        return is_any_good(sub, filter_str)
                key = field_keys.get(field)
                if key is None:
                        # Not a recognized field identifier, use the raw filter
                        return is_any_good(sub, field + ':' + filter_str)
                # .get() so a subtitle missing the field simply doesn't
                # match, instead of raising KeyError.
                return sub.get(key, '').lower().find(filter_str) >= 0

        if not filters:
                return subs

        return [sub for sub in subs
                        if all(is_good(sub, f) for f in filters)]
195
196
def subdivx_get_subs(query_str):
        """Query subdivx.com for query_str and return parsed subtitles.

        Fetches successive result pages until an empty one is returned,
        accumulating every subtitle dict produced by SubDivXHTMLParser.
        """
        page_number = 1
        subs = []
        while True:
                query = SubDivXQuery(query_str, page_number)
                url = urlopen(query.url)
                parser = SubDivXHTMLParser(query.down_uri)

                try:
                        encoding = get_encoding(url.info())
                except Exception:
                        # Was a bare `except:`, which would also swallow
                        # KeyboardInterrupt/SystemExit; fall back to ASCII
                        # when the charset can't be determined.
                        encoding = 'ascii'

                # Close the connection even if decoding/parsing fails.
                try:
                        for line in url:
                                parser.feed(line.decode(encoding))
                finally:
                        url.close()

                if not parser.subs:
                        break

                subs.extend(parser.subs)
                page_number += 1

        return subs
222
223
def unzip_subs(fname):
        """Extract subtitle files (.srt/.sub) from the zip archive fname.

        Members with dangerous names ('..' components or absolute paths)
        are reported and skipped to avoid path traversal.
        """
        sub_exts = ('.srt', '.sub')
        z = zipfile.ZipFile(fname, 'r')
        # try/finally: the archive was previously never closed (fd leak).
        try:
                z.printdir()
                for fn in z.namelist():
                        if fn.endswith(sub_exts):
                                if '..' in fn or fn.startswith('/'):
                                        error('Ignoring file with dangerous name: %s',
                                                        fn)
                                        continue
                                echo('Extracting %s...', fn)
                                z.extract(fn)
        finally:
                z.close()
236
237
def get_subs(query_str, filters):
        """Search, filter and download subtitles matching query_str.

        Results are listed most-downloaded first.  Unless --list-only was
        given, each subtitle is downloaded and unpacked: zip archives via
        unzip_subs(), rar archives via the external `rar` tool.
        """
        global opts
        zip_exts = ('application/zip',)
        rar_exts = ('application/rar', 'application/x-rar-compressed')

        subs = subdivx_get_subs(query_str)
        subs = filter_subtitles(subs, filters)
        subs.sort(key=lambda s: int(s['downloads']), reverse=True)

        for sub in subs:
                echo('''\
- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
  %(desc)s
        DOWNLOADING ...
''', **sub)
                if opts.list_only:
                        continue
                fname, headers = urlretrieve(sub['url'])
                if 'Content-Type' in headers:
                        if headers['Content-Type'] in zip_exts:
                                unzip_subs(fname)
                        elif headers['Content-Type'] in rar_exts:
                                if subprocess.call(['rar', 'x', fname]) != 0:
                                        error('Error unraring file %s', fname)
                        else:
                                # Bug fix: the format string was missing %s,
                                # making output() raise TypeError ("not all
                                # arguments converted") instead of printing.
                                error('Unrecognized file type: %s',
                                                headers['Content-Type'])
                else:
                        error('No Content-Type!')
267
268
def parse_args(argv):
        """Parse the command line, returning (query, filters, options).

        argv is the full argument vector including the program name.
        Exits via parser.error() when the query is missing or when
        --quiet and --list-only are combined.
        """
        from optparse import OptionParser
        parser = OptionParser(usage="%prog [OPTIONS] QUERY [FILTER ...]",
                        description="""
Download subtitles from subdivx.com searching the string QUERY. If FILTERs are
specified, only subtitles that matches all those filters are downloaded.
Filters have the format "X:filter", where X is a field specification: t=titulo,
d=desc, a=autor, f=formato, c=comentarios, C=cds, F=fecha and D=downloads.
filter is a string that should be found on that field (case insensitive). If
the format specifier is not known (or there isn't one) the filter string is
looked in all the fields.
                        """.strip())
        parser.add_option("-l", "--list-only",
                        default=False, action='store_true',
                        help="don't download the subtitles, just list them")
        parser.add_option("-q", "--quiet",
                        default=False, action='store_true',
                        help="don't print progress messages")

        # Bug fix: the argv parameter used to be ignored (parse_args() read
        # sys.argv implicitly); honor it, skipping the program name.
        (opts, args) = parser.parse_args(argv[1:])
        if not args:
                parser.error("Missing query string")

        if opts.quiet and opts.list_only:
                parser.error("Using --quiet and --list-only together doesn't "
                                "make any sense")

        return (args[0], args[1:], opts)
297
# Entry-point guard so the file can be imported without side effects.
# opts stays a module-level global, as echo()/get_subs() expect.
if __name__ == '__main__':
        (query_str, filters, opts) = parse_args(sys.argv)

        get_subs(query_str, filters)
301
302