X-Git-Url: https://git.llucax.com/software/subdivxget.git/blobdiff_plain/a53f6ed5e6cbd150f0fd27dbaae850f57eff33fe..f274666ba6db3400c33faf64aa40c3d2ba2d5156:/subdivxget?ds=sidebyside

diff --git a/subdivxget b/subdivxget
index aaa3d44..bd6fc6e 100755
--- a/subdivxget
+++ b/subdivxget
@@ -7,12 +7,13 @@ import subprocess
 import HTMLParser
 
 class SubDivXQuery:
-	def __init__(self, to_search):
+	def __init__(self, to_search, page_number):
 		self.host = "www.subdivx.com"
 		self.page = "/index.php"
 		self.down_page = "/bajar.php"
 		self.query = dict(
 			buscar = to_search,
+			pg = page_number,
 			accion = 5,
 			masdesc = '',
 			subtitulos = 1,
@@ -42,6 +43,8 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 		self.parsing = False
 		self.subs = []
 		self.attr = None
+		self.cur = None
+		self.in_script_style = False
 
 	def handle_starttag(self, tag, attrs):
 		attrs = dict(attrs)
@@ -51,6 +54,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 			self.parsing = True
 		if not self.parsing:
 			return
+		if tag == 'script' or tag == 'style':
+			self.in_script_style = True
+			return
 		if tag == 'div':
 			if attrs.get('id') == 'buscador_detalle':
 				self.parsing = True
@@ -66,41 +72,60 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 
 	def handle_endtag(self, tag):
 		if self.parsing:
+			if tag == 'script' or tag == 'style':
+				self.in_script_style = False
+				return
 			self.depth -= 1
 		if self.depth == 0:
 			self.parsing = False
 
 	def handle_data(self, data):
-		if self.parsing:
-			data = data.strip()
-			if self.attr is not None and data:
-				self.cur[self.attr] = data
-				self.attr = None
-			elif data in ('Downloads:', 'Cds:', 'Comentarios:',
-					'Formato:'):
-				self.attr = data[:-1].lower()
-			elif data == 'Subido por:':
-				self.attr = 'autor'
-			elif data == 'el':
-				self.attr = 'fecha'
+		if not self.parsing:
+			return
+		data = data.strip()
+		# Hack: comments inside <script> and <style> are not passed to
+		# handle_comment(), so we ignore everything inside those tags
+		if self.in_script_style:
+			return
+		if self.attr is not None and data:
+			self.cur[self.attr] = data
+			self.attr = None
+		elif data in ('Downloads:', 'Cds:', 'Comentarios:',
+				'Formato:'):
+			self.attr = data[:-1].lower()
+		elif data == 'Subido por:':
+			self.attr = 'autor'
+		elif data == 'el':
+			self.attr = 'fecha'
 
 
-def get_subs(query_str):
-	query = SubDivXQuery(query_str)
+def subdivx_get_subs(query_str):
+	page_number = 1
+	subs = []
+	while True:
+		query = SubDivXQuery(query_str, page_number)
+		url = urllib.urlopen(query.url)
+		parser = SubDivXHTMLParser(query.down_uri)
+
+		for line in url:
+			parser.feed(line)
 
-	url = urllib.urlopen(query.url)
+		url.close()
 
-	parser = SubDivXHTMLParser(query.down_uri)
+		if not parser.subs:
+			break
 
-	for line in url:
-		parser.feed(line)
+		subs.extend(parser.subs)
+		page_number += 1
 
-	url.close()
+	return sorted(subs, key=lambda s: int(s['downloads']), reverse=True)
 
+
+def get_subs(query_str):
 	zip_exts = ('application/zip',)
 	rar_exts = ('application/rar', 'application/x-rar-compressed')
 
-	for sub in sorted(parser.subs, key=lambda s: int(s['downloads']), reverse=True):
+	for sub in subdivx_get_subs(query_str):
 		print '''\
 	- %(title)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
 	  %(desc)s