"""List company quotes from the tsx.com listed-companies search pages.

Run as a script, this prints one ``;``-separated line per quote for
market ``V`` and then for market ``T``.
"""

import re
import sys
import urllib

try:
    from htmlentitydefs import name2codepoint as n2cp
except ImportError:  # Python 3 renamed the module
    from html.entities import name2codepoint as n2cp

try:
    _unichr = unichr
except NameError:  # Python 3: chr() already returns text
    _unichr = chr

try:
    _urlopen = urllib.urlopen
except AttributeError:  # Python 3 moved urlopen into urllib.request
    from urllib.request import urlopen as _urlopen

# "&name;", "&#123;" or "&#x7b;" -- group 1 is the optional "#",
# group 2 is the entity body.
_ENTITY_RE = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")

# Group 1 is the value of the QuoteSymbol_1 query parameter inside a link,
# group 2 is the text that follows the opening tag.
_QUOTE_RE = re.compile(r'QuoteSymbol_1=([^"]+)">([^<]+)<')

_LIST_URL = (
    "http://www.tsx.com/HttpController?GetPage=ListedCompaniesViewPage"
    "&SearchCriteria=Name&SearchKeyword=%s&SearchType=StartWith"
    "&Page=%d&SearchIsMarket=Yes&Market=%s&Language=en"
)

_MARKETS = ('V', 'T')

# Search prefixes tried in order: a-z, then 0-9.
_SEARCH_KEYS = "abcdefghijklmnopqrstuvwxyz0123456789"


def substitute_entity(match):
    """Return the replacement text for one ``_ENTITY_RE`` match.

    Numeric references (decimal ``&#38;`` or hexadecimal ``&#x26;``) become
    the corresponding character.  Named references are looked up in
    ``name2codepoint``.  Anything that cannot be decoded -- an unknown name,
    a malformed number, or a code point outside the supported range -- is
    returned unchanged instead of raising.
    """
    ent = match.group(2)
    if match.group(1) == "#":
        try:
            if ent[:1] in ("x", "X"):
                return _unichr(int(ent[1:], 16))
            return _unichr(int(ent))
        except (ValueError, OverflowError):
            # e.g. "&#abc;" or a code point the platform cannot represent:
            # leave the original text in place.
            return match.group()
    cp = n2cp.get(ent)
    if cp:
        return _unichr(cp)
    return match.group()


def decode_htmlentities(string):
    """Return ``string`` with HTML entities replaced by their characters.

    Entities that cannot be decoded are left as they were.
    """
    return _ENTITY_RE.sub(substitute_entity, string)


def _geturl(url):
    """Fetch ``url`` and return the response body as text."""
    response = _urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()
    if not isinstance(data, str):
        # Python 3 hands back bytes.  latin-1 maps every byte to the code
        # point of the same value, which mirrors how the Python 2 byte
        # string was searched by the text regex.
        data = data.decode("latin-1")
    return data


def getquotes(market):
    """Yield ``(symbol, name, market)`` tuples for every listed company.

    For each search prefix (``a``-``z`` then ``0``-``9``) the result pages
    are fetched one after another until a page yields exactly the same
    quotes as the previously fetched page, which is taken as the end of the
    listing for that prefix.

    ``market`` must be ``'V'`` or ``'T'``.  Because this is a generator, the
    ``ValueError`` for any other value is raised when iteration starts, not
    when the function is called.
    """
    if market not in _MARKETS:
        raise ValueError("Market must be either V or T")

    # Deliberately not reset between prefixes: this keeps the original
    # stop condition, including across a prefix with no results.
    prevq = None
    for key in _SEARCH_KEYS:
        page = 1
        while True:
            data = _geturl(_LIST_URL % (key, page, market))
            quotes = _QUOTE_RE.findall(data)
            if quotes == prevq:
                break
            for symbol, name in quotes:
                yield (symbol, decode_htmlentities(name), market)
            prevq = quotes
            page += 1


def csv(q):
    """Join the fields of ``q`` into a single ``;``-separated string."""
    return ";".join(q)


if __name__ == "__main__":
    for market in _MARKETS:
        for q in getquotes(market):
            print(csv(q))