需要将一个略大的文本文件序列化到数据库,关键的几个点:
- 用executemany代替execute
- 开启此参数 PRAGMA synchronous = OFF
- 下面附上示例代码
import functools
import os
import sqlite3
import time
def exeTime(func):
    """Decorator that logs wall-clock start/end timestamps of *func* and the
    total seconds it took, then returns the wrapped function's result unchanged.
    """
    @functools.wraps(func)  # preserve func.__name__/__doc__ on the wrapper
    def newFunc(*args, **args2):
        t0 = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
        back = func(*args, **args2)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
        print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
        return back
    return newFunc
@exeTime
def func(dmp_path='test.dmp', db_path='names_nodes_scientific.db'):
    """Bulk-load (taxid, name) pairs from a '|'-delimited dump file into a
    fresh SQLite database.

    Args:
        dmp_path: path of the input dump file; each line is split on '|' and
            the first two fields (stripped) become one row.
        db_path: path of the SQLite database file to create and populate.

    Raises:
        sqlite3.OperationalError: if the `names` table already exists.
        IOError/OSError: if the dump file cannot be opened.
    """
    print("Creating %s..." % db_path)
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute("CREATE TABLE names (taxid INTEGER PRIMARY KEY, name TEXT)")
        # synchronous=OFF skips per-write fsync -- much faster bulk load at
        # the cost of durability if the OS crashes mid-import.
        c.execute("PRAGMA synchronous = OFF")
        # One explicit transaction around the whole load instead of one
        # implicit transaction per INSERT.
        c.execute("BEGIN TRANSACTION")
        with open(dmp_path, 'r') as map_file:
            lst = []
            for line in map_file:
                fields = line.split("|")
                taxid = fields[0].strip()
                name = fields[1].strip()
                lst.append((taxid, name))
            # executemany batches all rows through one prepared statement.
            c.executemany("INSERT INTO names VALUES (?,?)", lst)
        conn.commit()
    finally:
        # Close the connection even if the load fails part-way through.
        conn.close()
class Main(object):
    """Placeholder application class; currently carries no state or behavior."""

    def __init__(self):
        # Nothing to set up yet -- just delegate to the base initializer.
        super(Main, self).__init__()
if __name__ == '__main__':
    # Remove any database left over from a previous run so CREATE TABLE
    # does not fail; on a first run the file does not exist, so the
    # missing-file error is deliberately ignored.
    try:
        os.remove("names_nodes_scientific.db")
    except OSError:
        pass
    func()
截取80兆左右的文件做测试, 测试结果比优化前时间缩短了一半以上(从12s降低到5.48s), 下一步考虑利用linux tmpfs的特性