subdivxget

   1 #!/usr/bin/env python
   2
   3 import sys
   4 if sys.version_info[0] < 3:
   5         from HTMLParser import HTMLParser
   6         from urllib import urlopen, urlretrieve, urlencode
   7         def get_encoding(info):
   8                 return info.getparam('charset')
   9
  10 else:
  11         from html.parser import HTMLParser
  12         from urllib.request import urlopen, urlretrieve
  13         from urllib.parse import urlencode
  14         def get_encoding(info):
  15                 return info.get_content_charset('ascii')
  16 import zipfile
  17 import subprocess
  18
  19
  20 def output(fo, fmt, *args, **kargs):
  21         if not args:
  22                 args = kargs
  23         fo.write((fmt % args) + '\n')
  24
  25 def echo(fmt, *args, **kargs):
  26         output(sys.stdout, fmt, *args, **kargs)
  27
  28 def error(fmt, *args, **kargs):
  29         output(sys.stderr, fmt, *args, **kargs)
  30
  31
  32 class SubDivXQuery:
  33         def __init__(self, to_search, page_number):
  34                 self.host = "www.subdivx.com"
  35                 self.page = "/index.php"
  36                 self.down_page = "/bajar.php"
  37                 self.query = dict(
  38                         buscar = to_search,
  39                         pg = page_number,
  40                         accion = 5,
  41                         masdesc = '',
  42                         subtitulos = 1,
  43                         realiza_b = 1,
  44                 )
  45         @property
  46         def url(self):
  47                 return 'http://%s%s?%s' % (self.host, self.page,
  48                                 urlencode(self.query))
  49         @property
  50         def page_uri(self):
  51                 return self.page + '?' + urlencode(self.query)
  52         @property
  53         def down_uri(self):
  54                 return 'http://' + self.host + self.down_page
  55
  56
  57 class SubDivXHTMLParser(HTMLParser):
  58
  59         IDLE = 1
  60         HEADER = 2
  61
  62         def __init__(self, down_uri):
  63                 HTMLParser.__init__(self)
  64                 self.down_uri = down_uri
  65                 self.depth = 0
  66                 self.parsing = False
  67                 self.subs = []
  68                 self.attr = None
  69                 self.attr_depth = 0
  70                 self.cur = None
  71                 self.in_script_style = False
  72
  73         def handle_starttag(self, tag, attrs):
  74                 attrs = dict(attrs)
  75                 if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
  76                         self.cur = dict()
  77                         self.subs.append(self.cur)
  78                         self.parsing = True
  79                 if not self.parsing:
  80                         return
  81                 if tag == 'script' or tag == 'style':
  82                         self.in_script_style = True
  83                         return
  84                 if tag == 'div':
  85                         if attrs.get('id') == 'buscador_detalle':
  86                                 self.parsing = True
  87                         elif attrs.get('id') == 'buscador_detalle_sub':
  88                                 self.attr = 'desc'
  89                                 self.attr_depth = self.depth + 1
  90                                 self.cur[self.attr] = ''
  91                 elif tag == 'a':
  92                         if attrs.get('class') == 'titulo_menu_izq':
  93                                 self.attr = 'titulo'
  94                                 self.attr_depth = self.depth + 1
  95                                 self.cur[self.attr] = ''
  96                         elif attrs.get('href', '').startswith(self.down_uri):
  97                                 self.cur['url'] = attrs['href']
  98                 # br are usually not closed, so ignore them in depth calculation
  99                 if self.parsing and tag != 'br':
 100                         self.depth += 1
 101
 102         def handle_endtag(self, tag):
 103                 if self.parsing:
 104                         if tag == 'script' or tag == 'style':
 105                                 self.in_script_style = False
 106                                 return
 107                         if self.depth == self.attr_depth:
 108                                 self.attr = None
 109                                 self.attr_depth = 0
 110                         # see comment in handle_starttag()
 111                         if tag != 'br':
 112                                 self.depth -= 1
 113                 if self.depth == 0:
 114                         self.parsing = False
 115
 116         def handle_data(self, data):
 117                 if not self.parsing:
 118                         return
 119                 data = data.strip()
 120                 # Hack to handle comments in <script> <style> which don't end
 121                 # up in handle_comment(), so we just ignore the whole tags
 122                 if self.in_script_style:
 123                         return
 124                 if self.attr is not None and data:
 125                         self.cur[self.attr] += ' ' + data
 126                         if self.attr_depth == 0:
 127                                 self.cur[self.attr] = self.cur[self.attr].strip()
 128                                 self.attr = None
 129                                 self.attr_depth = 0
 130                 elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
 131                         self.attr = data[:-1].lower()
 132                         self.attr_depth = 0
 133                         self.cur[self.attr] = ''
 134                 elif data == 'Subido por:':
 135                         self.attr = 'autor'
 136                         self.attr_depth = 0
 137                         self.cur[self.attr] = ''
 138                 elif data == 'el':
 139                         self.attr = 'fecha'
 140                         self.attr_depth = 0
 141                         self.cur[self.attr] = ''
 142
 143
 144 def filter_subtitles(subs, filters):
 145         def is_good(sub, filter):
 146                 def is_any_good(sub, filter):
 147                         for value in sub.values():
 148                                 if value.lower().find(filter) >= 0:
 149                                         return True
 150
 151                 field = None
 152                 if len(filter) > 2 and filter[1] == ':':
 153                         field = filter[0]
 154                         filter = filter[2:]
 155                 filter = filter.lower()
 156
 157                 if field is None:
 158                         return is_any_good(sub, filter)
 159                 elif field == 't':
 160                         key = 'titulo'
 161                 elif field == 'd':
 162                         key = 'desc'
 163                 elif field == 'a':
 164                         key = 'autor'
 165                 elif field == 'f':
 166                         key = 'formato'
 167                 elif field == 'c':
 168                         key = 'comentarios'
 169                 elif field == 'C':
 170                         key = 'cds'
 171                 elif field == 'F':
 172                         key = 'fecha'
 173                 elif field == 'D':
 174                         key = 'downloads'
 175                 else:
 176                         # Not a recognizer field identifier, use the raw filter
 177                         return is_any_good(sub, field + ':' + filter)
 178
 179                 return sub[key].lower().find(filter) >= 0
 180
 181         if not filters:
 182                 return subs
 183
 184         result = []
 185         for sub in subs:
 186                 for filter in filters:
 187                         if not is_good(sub, filter):
 188                                 break
 189                 else:
 190                         result.append(sub)
 191         return result
 192
 193
 194 def subdivx_get_subs(query_str):
 195         page_number = 1
 196         subs = []
 197         while True:
 198                 query = SubDivXQuery(query_str, page_number)
 199                 url = urlopen(query.url)
 200                 parser = SubDivXHTMLParser(query.down_uri)
 201
 202                 try:
 203                         encoding = get_encoding(url.info())
 204                 except:
 205                         encoding = 'ascii'
 206
 207                 for line in url:
 208                         parser.feed(line.decode(encoding))
 209
 210                 url.close()
 211
 212                 if not parser.subs:
 213                         break
 214
 215                 subs.extend(parser.subs)
 216                 page_number += 1
 217
 218         return subs
 219
 220
 221 def get_subs(query_str, filters):
 222         sub_exts = ('.srt', '.sub')
 223         zip_exts = ('application/zip',)
 224         rar_exts = ('application/rar', 'application/x-rar-compressed')
 225
 226         subs = subdivx_get_subs(query_str)
 227         subs = filter_subtitles(subs, filters)
 228         subs.sort(key=lambda s: int(s['downloads']), reverse=True)
 229
 230         for sub in subs:
 231                 echo('''\
 232 - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
 233   %(desc)s
 234         DOWNLOADING ...
 235 ''', **sub)
 236                 fname, headers = urlretrieve(sub['url'])
 237                 if 'Content-Type' in headers:
 238                         if headers['Content-Type'] in zip_exts:
 239                                 z = zipfile.ZipFile(fname, 'r')
 240                                 z.printdir()
 241                                 for fn in z.namelist():
 242                                         if fn.endswith(sub_exts):
 243                                                 if '..' in fn or fn.startswith('/'):
 244                                                         error('Dangerous file name: %s', fn)
 245                                                         continue
 246                                                 echo('Extracting %s...', fn)
 247                                                 z.extract(fn)
 248                         elif headers['Content-Type'] in rar_exts:
 249                                 if subprocess.call(['rar', 'x', fname]) != 0:
 250                                         error('Error unraring file %s', fname)
 251                         else:
 252                                 error('Unrecognized file type:',
 253                                                 headers['Content-Type'])
 254                 else:
 255                         error('No Content-Type!')
 256
 257
 258 get_subs(sys.argv[1], sys.argv[2:])
 259