Toolserver:Python/MySQL bulk load
This page was moved from the Toolserver wiki.
Toolserver has been replaced by Toolforge. As such, the instructions here may no longer work, but may still be of historical interest.
Please help by updating examples, links, template links, etc. If a page is still relevant, move it to a normal title and leave a redirect.
# Google Maps link extractor
# This demonstrates LOAD DATA LOCAL INFILE in an ETL operations in python using
# oursql in conjunction with a Minimum bounding rectangle spatial (MBR) index.
#
# Dispenser 2011, Public domain
import os, re, oursql
mysql_byte_escape = dict((chr(i), chr(i)) for i in range(256))
mysql_byte_escape.update({
# Bytes escape
b'\0': b'\\0',
b'\b': b'\\b',
b'\n': b'\\n',
b'\r': b'\\r',
b'\t': b'\\t',
b'\x16': b'\\Z',
b'\\': b'\\\\',
})
def MySQL_format(s, encoding='utf-8'):
if s is None:
return b'\\N'
elif isinstance(s, bytes):
return b''.join(map(mysql_byte_escape.__getitem__, s))
elif isinstance(s, unicode):
return b''.join(map(mysql_byte_escape.__getitem__, s.encode(encoding)))
elif isinstance(s, bool):
return b'1' if s else b'0'
elif isinstance(s, (int, long)):
return bytes(s)
elif isinstance(s, float):
# repr() does not round unlike str()
return repr(s)
else:
raise TypeError
def main():
# Database_name.target_table_name
spatial_table = "u_dispenser_p.gmap_coordinates"
# Temporary file, can also be imported into Excel
filename='./gmaps_extract.tsv'
wiki = 'enwiki_p'
conn = oursql.connect(
db=wiki,
host=wiki.replace('_', '-')+'.userdb.toolserver.org',
read_default_file=os.path.expanduser("~/.my.cnf"),
local_infile=True # IMPORTANT
)
# Extract
cursor = conn.cursor()
cursor.execute("SELECT el_from, el_to FROM externallinks WHERE el_to LIKE ?", ("http://maps.google.com/%",))
# Transform
with open(filename, 'wb') as f:
query_location_R = re.compile(r'([&?]q=([^&]*)|).*?[&?]s?ll=([0-9.+-]+),([0-9.+-]+)')
for el_from, el_to in cursor:
m = query_location_R.search(el_to)
if not m:continue
params = (el_from, float(m.group(3)), float(m.group(4)), m.group(2) and m.group(2)[:255],)
# data is escaped using our MySQL_format so tabs can be used to separate columns and
# newline separate rows. Some special characters are also escaped.
f.write(b'\t'.join((MySQL_format(s) for s in params)) )
f.write(b'\n')
# Load
cursor.execute("""
CREATE TABLE """+spatial_table+""" (
gm_from INT(10) UNSIGNED NOT NULL,
gm_lat DOUBLE NOT NULL,
gm_lng DOUBLE NOT NULL,
gm_query VARBINARY(255) NULL,
gm_point POINT NOT NULL,
KEY gm_from (gm_from),
SPATIAL KEY gm_point (gm_point)
) ENGINE = MyISAM;
""")
cursor.execute("""
LOAD DATA LOCAL INFILE '"""+filename+"""'
INTO TABLE """+spatial_table+"""
/* Here it the same as the default. See manual more formats like CSV */
FIELDS TERMINATED BY '\\t'
LINES TERMINATED BY '\\n'
(gm_from,gm_lat,gm_lng,gm_query)
/* MySQL's internal geometery is different from WkT and WkB, so we create it on the fly */
SET gm_point = GeomFromText(CONCAT('POINT(',gm_lat,' ',gm_lng,')'))
""")
if __name__ == "__main__":
main()