subdivxget

   1 #!/usr/bin/env python
   2
   3 import sys
   4 if sys.version_info[0] < 3:
   5         from HTMLParser import HTMLParser
   6         from urllib import urlopen, urlretrieve, urlencode
   7         def get_encoding(info):
   8                 return info.getparam('charset')
   9
  10 else:
  11         from html.parser import HTMLParser
  12         from urllib.request import urlopen, urlretrieve
  13         from urllib.parse import urlencode
  14         def get_encoding(info):
  15                 return info.get_content_charset('ascii')
  16 import zipfile
  17 import subprocess
  18
  19 class SubDivXQuery:
  20         def __init__(self, to_search, page_number):
  21                 self.host = "www.subdivx.com"
  22                 self.page = "/index.php"
  23                 self.down_page = "/bajar.php"
  24                 self.query = dict(
  25                         buscar = to_search,
  26                         pg = page_number,
  27                         accion = 5,
  28                         masdesc = '',
  29                         subtitulos = 1,
  30                         realiza_b = 1,
  31                 )
  32         @property
  33         def url(self):
  34                 return 'http://%s%s?%s' % (self.host, self.page,
  35                                 urlencode(self.query))
  36         @property
  37         def page_uri(self):
  38                 return self.page + '?' + urlencode(self.query)
  39         @property
  40         def down_uri(self):
  41                 return 'http://' + self.host + self.down_page
  42
  43
  44 class SubDivXHTMLParser(HTMLParser):
  45
  46         IDLE = 1
  47         HEADER = 2
  48
  49         def __init__(self, down_uri):
  50                 HTMLParser.__init__(self)
  51                 self.down_uri = down_uri
  52                 self.depth = 0
  53                 self.parsing = False
  54                 self.subs = []
  55                 self.attr = None
  56                 self.attr_depth = 0
  57                 self.cur = None
  58                 self.in_script_style = False
  59
  60         def handle_starttag(self, tag, attrs):
  61                 attrs = dict(attrs)
  62                 if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
  63                         self.cur = dict()
  64                         self.subs.append(self.cur)
  65                         self.parsing = True
  66                 if not self.parsing:
  67                         return
  68                 if tag == 'script' or tag == 'style':
  69                         self.in_script_style = True
  70                         return
  71                 if tag == 'div':
  72                         if attrs.get('id') == 'buscador_detalle':
  73                                 self.parsing = True
  74                         elif attrs.get('id') == 'buscador_detalle_sub':
  75                                 self.attr = 'desc'
  76                                 self.attr_depth = self.depth + 1
  77                                 self.cur[self.attr] = ''
  78                 elif tag == 'a':
  79                         if attrs.get('class') == 'titulo_menu_izq':
  80                                 self.attr = 'titulo'
  81                                 self.attr_depth = self.depth + 1
  82                                 self.cur[self.attr] = ''
  83                         elif attrs.get('href', '').startswith(self.down_uri):
  84                                 self.cur['url'] = attrs['href']
  85                 # br are usually not closed, so ignore them in depth calculation
  86                 if self.parsing and tag != 'br':
  87                         self.depth += 1
  88
  89         def handle_endtag(self, tag):
  90                 if self.parsing:
  91                         if tag == 'script' or tag == 'style':
  92                                 self.in_script_style = False
  93                                 return
  94                         if self.depth == self.attr_depth:
  95                                 self.attr = None
  96                                 self.attr_depth = 0
  97                         # see comment in handle_starttag()
  98                         if tag != 'br':
  99                                 self.depth -= 1
 100                 if self.depth == 0:
 101                         self.parsing = False
 102
 103         def handle_data(self, data):
 104                 if not self.parsing:
 105                         return
 106                 data = data.strip()
 107                 # Hack to handle comments in <script> <style> which don't end
 108                 # up in handle_comment(), so we just ignore the whole tags
 109                 if self.in_script_style:
 110                         return
 111                 if self.attr is not None and data:
 112                         self.cur[self.attr] += ' ' + data
 113                         if self.attr_depth == 0:
 114                                 self.cur[self.attr] = self.cur[self.attr].strip()
 115                                 self.attr = None
 116                                 self.attr_depth = 0
 117                 elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
 118                         self.attr = data[:-1].lower()
 119                         self.attr_depth = 0
 120                         self.cur[self.attr] = ''
 121                 elif data == 'Subido por:':
 122                         self.attr = 'autor'
 123                         self.attr_depth = 0
 124                         self.cur[self.attr] = ''
 125                 elif data == 'el':
 126                         self.attr = 'fecha'
 127                         self.attr_depth = 0
 128                         self.cur[self.attr] = ''
 129
 130
 131 def filter_subtitles(subs, filters):
 132         def is_good(sub, filter):
 133                 def is_any_good(sub, filter):
 134                         for value in sub.values():
 135                                 if value.lower().find(filter) >= 0:
 136                                         return True
 137
 138                 field = None
 139                 if len(filter) > 2 and filter[1] == ':':
 140                         field = filter[0]
 141                         filter = filter[2:]
 142                 filter = filter.lower()
 143
 144                 if field is None:
 145                         return is_any_good(sub, filter)
 146                 elif field == 't':
 147                         key = 'titulo'
 148                 elif field == 'd':
 149                         key = 'desc'
 150                 elif field == 'a':
 151                         key = 'autor'
 152                 elif field == 'f':
 153                         key = 'formato'
 154                 elif field == 'c':
 155                         key = 'comentarios'
 156                 elif field == 'C':
 157                         key = 'cds'
 158                 elif field == 'F':
 159                         key = 'fecha'
 160                 elif field == 'D':
 161                         key = 'downloads'
 162                 else:
 163                         # Not a recognizer field identifier, use the raw filter
 164                         return is_any_good(sub, field + ':' + filter)
 165
 166                 return sub[key].lower().find(filter) >= 0
 167
 168         if not filters:
 169                 return subs
 170
 171         result = []
 172         for sub in subs:
 173                 for filter in filters:
 174                         if not is_good(sub, filter):
 175                                 break
 176                 else:
 177                         result.append(sub)
 178         return result
 179
 180
 181 def subdivx_get_subs(query_str):
 182         page_number = 1
 183         subs = []
 184         while True:
 185                 query = SubDivXQuery(query_str, page_number)
 186                 url = urlopen(query.url)
 187                 parser = SubDivXHTMLParser(query.down_uri)
 188
 189                 try:
 190                         encoding = get_encoding(url.info())
 191                 except:
 192                         encoding = 'ascii'
 193
 194                 for line in url:
 195                         parser.feed(line.decode(encoding))
 196
 197                 url.close()
 198
 199                 if not parser.subs:
 200                         break
 201
 202                 subs.extend(parser.subs)
 203                 page_number += 1
 204
 205         return sorted(subs, key=lambda s: int(s['downloads']), reverse=True)
 206
 207
 208 def get_subs(query_str, filters):
 209         sub_exts = ('.srt', '.sub')
 210         zip_exts = ('application/zip',)
 211         rar_exts = ('application/rar', 'application/x-rar-compressed')
 212
 213         subs = subdivx_get_subs(query_str)
 214         subs = filter_subtitles(subs, filters)
 215
 216         for sub in subs:
 217                 print('''\
 218 - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
 219   %(desc)s
 220         DOWNLOADING ...
 221 ''' % sub)
 222                 continue
 223                 fname, headers = urlretrieve(sub['url'])
 224                 if 'Content-Type' in headers:
 225                         if headers['Content-Type'] in zip_exts:
 226                                 z = zipfile.ZipFile(fname, 'r')
 227                                 z.printdir()
 228                                 for fn in z.namelist():
 229                                         if fn.endswith(sub_exts):
 230                                                 if '..' in fn or fn.startswith('/'):
 231                                                         print('Dangerous file name:', fn)
 232                                                         continue
 233                                                 print('Extracting', fn, '...')
 234                                                 z.extract(fn)
 235                         elif headers['Content-Type'] in rar_exts:
 236                                 if subprocess.call(['rar', 'x', fname]) != 0:
 237                                         print('Error unraring file %s' % fname)
 238                         else:
 239                                 print('Unrecognized file type:', headers['Content-Type'])
 240                 else:
 241                         print('No Content-Type!')
 242
 243
 244 get_subs(sys.argv[1], sys.argv[2:])
 245