import re, urllib, sys
from htmlentitydefs import name2codepoint as n2cp
def substitute_entity(match):
    """Return the replacement text for one matched HTML entity.

    ``match`` is expected to expose two groups: group 1 is ``"#"`` for a
    numeric character reference (empty otherwise) and group 2 is the
    entity body (digits, ``x``-prefixed hex digits, or an entity name).

    Numeric references (decimal ``&#65;`` or hexadecimal ``&#x41;``) are
    converted to the corresponding character; named entities are looked up
    in ``n2cp``.  Anything that cannot be decoded is returned unchanged.
    """
    ent = match.group(2)
    if match.group(1) == "#":
        try:
            if ent[:1] in ("x", "X"):
                # Hexadecimal character reference, e.g. "&#x41;".
                codepoint = int(ent[1:], 16)
            else:
                codepoint = int(ent)
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed number ("&#abc;", "&#x;") or a code point the
            # interpreter cannot represent: keep the original text rather
            # than aborting the whole substitution.
            return match.group()
    cp = n2cp.get(ent)
    if cp:
        return unichr(cp)
    # Unknown entity name: leave the source text as it was.
    return match.group()
# Matches "&" + optional "#" + (1-5 digits or 1-8 word characters) + ";".
# Compiled once at import time; a raw string keeps the "\d"/"\w" escapes
# from being interpreted as (invalid) string escapes.
_ENTITY_RE = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")
def decode_htmlentities(string):
    """Return ``string`` with HTML entities replaced.

    Every match of the entity pattern is handed to ``substitute_entity``,
    whose return value replaces the matched text.
    """
    return _ENTITY_RE.sub(substitute_entity, string)
def _geturl(url):
    """Fetch ``url`` and return the full response body.

    The response object is always closed, even if reading fails, so the
    underlying connection is not leaked across the many requests issued
    while paging through results.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()
def getquotes(market):
    """Yield ``(symbol, name, market)`` tuples scraped from tsx.com.

    For every search prefix (the letters ``a``-``z`` followed by the digits
    ``0``-``9``) the listing pages are fetched one after another and each
    ``QuoteSymbol_1=...">...<`` match is yielded, with HTML entities in the
    name decoded.

    ``market`` must be ``'V'`` or ``'T'``; otherwise ``ValueError`` is
    raised.  Because this is a generator, the check runs when the first
    item is requested, not when ``getquotes`` is called.
    """
    if market not in ('V', 'T'):
        # A bare string is not a valid exception object; raise a real one
        # so the message actually reaches the caller.
        raise ValueError("Market must be either V or T")
    qre = re.compile('QuoteSymbol_1=([^"]+)">([^<]+)<')
    prefixes = [chr(code) for code in range(ord('a'), ord('z') + 1)] + list(range(10))
    # Matches from the most recently processed page.  Deliberately NOT reset
    # per prefix: paging for a prefix stops as soon as a page produces the
    # same matches as the page fetched just before it.
    prevq = None
    for prefix in prefixes:
        page = 1
        while True:
            url = ("http://www.tsx.com/HttpController?GetPage=ListedCompaniesViewPage"
                   "&SearchCriteria=Name&SearchKeyword=%s&SearchType=StartWith"
                   "&Page=%d&SearchIsMarket=Yes&Market=%s&Language=en"
                   % (prefix, page, market))
            quotes = qre.findall(_geturl(url))
            if quotes == prevq:
                # Same result as the previous page: treat as end of listing.
                break
            for symbol, name in quotes:
                yield (symbol, decode_htmlentities(name), market)
            prevq = quotes
            page += 1
def csv(q):
    """Join the string items of ``q`` into one semicolon-separated line."""
    separator = ";"
    line = separator.join(q)
    return line
if __name__ == "__main__":
    # Dump every listing as one semicolon-separated line: first market 'V',
    # then market 'T'.
    for market_code in ('V', 'T'):
        for quote in getquotes(market_code):
            print(csv(quote))