#!/usr/bin/env python

import sys
if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        from urllib import urlopen, urlretrieve, urlencode
        def get_encoding(info):
                return info.getparam('charset')

else:
        from html.parser import HTMLParser
        from urllib.request import urlopen, urlretrieve
        from urllib.parse import urlencode
        def get_encoding(info):
                return info.get_content_charset('ascii')
import zipfile
import subprocess


def output(fo, fmt, *args, **kargs):
        # Format with positional args if any were given, otherwise with
        # keyword args
        if not args:
                args = kargs
        fo.write((fmt % args) + '\n')

def echo(fmt, *args, **kargs):
        output(sys.stdout, fmt, *args, **kargs)

def error(fmt, *args, **kargs):
        output(sys.stderr, fmt, *args, **kargs)


class SubDivXQuery:
        def __init__(self, to_search, page_number):
                self.host = "www.subdivx.com"
                self.page = "/index.php"
                self.down_page = "/bajar.php"
                self.query = dict(
                        buscar = to_search,
                        pg = page_number,
                        accion = 5,
                        masdesc = '',
                        subtitulos = 1,
                        realiza_b = 1,
                )
        @property
        def url(self):
                return 'http://%s%s?%s' % (self.host, self.page,
                                urlencode(self.query))
        @property
        def page_uri(self):
                return self.page + '?' + urlencode(self.query)
        @property
        def down_uri(self):
                return 'http://' + self.host + self.down_page

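# Illustrative example of the generated search URL (the parameter order in
# the query string depends on urlencode() and may vary):
#   SubDivXQuery('the simpsons', 1).url
#   -> 'http://www.subdivx.com/index.php?buscar=the+simpsons&pg=1&accion=5&...'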

class SubDivXHTMLParser(HTMLParser):

        IDLE = 1
        HEADER = 2

        def __init__(self, down_uri):
                HTMLParser.__init__(self)
                self.down_uri = down_uri
                self.depth = 0
                self.parsing = False
                self.subs = []
                self.attr = None
                self.attr_depth = 0
                self.cur = None
                self.in_script_style = False

        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
                if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
                        self.cur = dict()
                        self.subs.append(self.cur)
                        self.parsing = True
                if not self.parsing:
                        return
                if tag == 'script' or tag == 'style':
                        self.in_script_style = True
                        return
                if tag == 'div':
                        if attrs.get('id') == 'buscador_detalle':
                                self.parsing = True
                        elif attrs.get('id') == 'buscador_detalle_sub':
                                self.attr = 'desc'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                elif tag == 'a':
                        if attrs.get('class') == 'titulo_menu_izq':
                                self.attr = 'titulo'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                        elif attrs.get('href', '').startswith(self.down_uri):
                                self.cur['url'] = attrs['href']
                # br tags are usually not closed, so ignore them in the depth
                # calculation
                if self.parsing and tag != 'br':
                        self.depth += 1

        def handle_endtag(self, tag):
                if self.parsing:
                        if tag == 'script' or tag == 'style':
                                self.in_script_style = False
                                return
                        if self.depth == self.attr_depth:
                                self.attr = None
                                self.attr_depth = 0
                        # see comment in handle_starttag()
                        if tag != 'br':
                                self.depth -= 1
                if self.depth == 0:
                        self.parsing = False

        def handle_data(self, data):
                if not self.parsing:
                        return
                data = data.strip()
                # Hack to handle comments inside <script>/<style>, which
                # don't end up in handle_comment(), so we just ignore those
                # tags entirely
                if self.in_script_style:
                        return
                if self.attr is not None and data:
                        self.cur[self.attr] += ' ' + data
                        if self.attr_depth == 0:
                                self.cur[self.attr] = self.cur[self.attr].strip()
                                self.attr = None
                                self.attr_depth = 0
                elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
                        self.attr = data[:-1].lower()
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'Subido por:':
                        self.attr = 'autor'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'el':
                        self.attr = 'fecha'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''

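# Each search result is collected as a dict; given the attribute names
# assigned above, an entry should look roughly like (illustrative values):
#   {'titulo': '...', 'desc': '...', 'url': 'http://www.subdivx.com/bajar.php?...',
#    'autor': '...', 'fecha': '...', 'downloads': '...', 'cds': '...',
#    'comentarios': '...', 'formato': '...'}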

def filter_subtitles(subs, filters):
        def is_good(sub, filter):
                def is_any_good(sub, filter):
                        for value in sub.values():
                                if value.lower().find(filter) >= 0:
                                        return True

                field = None
                if len(filter) > 2 and filter[1] == ':':
                        field = filter[0]
                        filter = filter[2:]
                filter = filter.lower()

                if field is None:
                        return is_any_good(sub, filter)
                elif field == 't':
                        key = 'titulo'
                elif field == 'd':
                        key = 'desc'
                elif field == 'a':
                        key = 'autor'
                elif field == 'f':
                        key = 'formato'
                elif field == 'c':
                        key = 'comentarios'
                elif field == 'C':
                        key = 'cds'
                elif field == 'F':
                        key = 'fecha'
                elif field == 'D':
                        key = 'downloads'
                else:
                        # Not a recognized field identifier, use the raw filter
                        return is_any_good(sub, field + ':' + filter)

                return sub[key].lower().find(filter) >= 0

        if not filters:
                return subs

        result = []
        for sub in subs:
                for filter in filters:
                        if not is_good(sub, filter):
                                break
                else:
                        result.append(sub)
        return result

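# For example (illustrative values): with filters ['t:simpsons', 'F:2011']
# only subtitles whose 'titulo' contains 'simpsons' and whose 'fecha'
# contains '2011' are kept; a plain filter like 'latino' matches if any
# field contains it.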

def subdivx_get_subs(query_str):
        page_number = 1
        subs = []
        while True:
                query = SubDivXQuery(query_str, page_number)
                url = urlopen(query.url)
                parser = SubDivXHTMLParser(query.down_uri)

                try:
                        encoding = get_encoding(url.info())
                except Exception:
                        encoding = 'ascii'

                for line in url:
                        parser.feed(line.decode(encoding))

                url.close()

                # An empty results page means we walked past the last page
                if not parser.subs:
                        break

                subs.extend(parser.subs)
                page_number += 1

        return subs

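# Illustrative usage: subdivx_get_subs('the simpsons') returns one dict per
# search result, accumulated across all result pages.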

def unzip_subs(fname):
        sub_exts = ('.srt', '.sub')
        z = zipfile.ZipFile(fname, 'r')
        z.printdir()
        for fn in z.namelist():
                if fn.endswith(sub_exts):
                        # Refuse entries that could escape the working
                        # directory (path traversal)
                        if '..' in fn or fn.startswith('/'):
                                error('Ignoring file with dangerous name: %s',
                                                fn)
                                continue
                        echo('Extracting %s...', fn)
                        z.extract(fn)


def get_subs(query_str, filters):
        global opts
        zip_exts = ('application/zip',)
        rar_exts = ('application/rar', 'application/x-rar-compressed')

        subs = subdivx_get_subs(query_str)
        subs = filter_subtitles(subs, filters)
        subs.sort(key=lambda s: int(s['downloads']), reverse=True)

        for sub in subs:
                echo('''\
- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
  %(desc)s
''', **sub)
                if opts.list_only:
                        continue
                echo('        DOWNLOADING ...')
                fname, headers = urlretrieve(sub['url'])
                if 'Content-Type' in headers:
                        if headers['Content-Type'] in zip_exts:
                                unzip_subs(fname)
                        elif headers['Content-Type'] in rar_exts:
                                if subprocess.call(['rar', 'x', fname]) != 0:
                                        error('Error unraring file %s', fname)
                        else:
                                error('Unrecognized file type: %s',
                                                headers['Content-Type'])
                else:
                        error('No Content-Type!')


def parse_args(argv):
        from optparse import OptionParser
        parser = OptionParser(usage="%prog [OPTIONS] QUERY [FILTER ...]",
                        description="""
Download subtitles from subdivx.com, searching for the string QUERY. If
FILTERs are specified, only subtitles matching all of them are downloaded.
Filters have the format "X:filter", where X is a field specifier: t=titulo,
d=desc, a=autor, f=formato, c=comentarios, C=cds, F=fecha and D=downloads.
filter is a string that should be found in that field (case insensitive). If
the field specifier is not known (or there isn't one), the filter string is
searched for in all fields.
                        """.strip())
        parser.add_option("-l", "--list-only",
                        default=False, action='store_true',
                        help="Don't download the subtitles, just list them")

        (opts, args) = parser.parse_args(argv[1:])
        if not args:
                parser.error("Missing query string")

        return (args[0], args[1:], opts)

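# Example invocations (illustrative query and filter values):
#   ./subdivxget "the simpsons s23e10" t:latino f:dvdrip
#   ./subdivxget -l "the simpsons"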
(query_str, filters, opts) = parse_args(sys.argv)

get_subs(query_str, filters)