Connecting pandas to a PostgreSQL Database

1. Reading from a PostgreSQL database with pandas

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import psycopg2
import sys

con = None

try:
    # connect with the example credentials and pull the whole table into a DataFrame
    con = psycopg2.connect("dbname='postgres' user='postgres' password='123'")
    sql = "SELECT * FROM Cars"
    df = pd.read_sql(sql, con)
    print('df')
    print(df)
except psycopg2.DatabaseError as e:
    print('Error %s' % e)
    sys.exit(1)
finally:
    if con:
        con.close()
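
Note that recent pandas releases emit a UserWarning when read_sql is handed a raw DBAPI connection like the psycopg2 connection above, since only SQLAlchemy connectables (and sqlite3) are officially supported. A minimal sketch of the same read going through a SQLAlchemy engine, assuming the same placeholder credentials as above:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from sqlalchemy import create_engine

# Same placeholder credentials as above; adjust host/port/dbname to your setup.
engine = create_engine('postgresql://postgres:123@localhost:5432/postgres')

# Passing the engine (rather than a raw DBAPI connection) avoids the warning.
df = pd.read_sql('SELECT * FROM cars', engine)
print(df)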

2. Using the index_col parameter to specify which column becomes the index

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import psycopg2
import sys

con = None

try:
    con = psycopg2.connect("dbname='postgres' user='postgres' password='123'")
    sql = "SELECT * FROM Cars"
    # index_col='id' makes the id column the DataFrame index instead of a data column
    df = pd.read_sql(sql, con, index_col='id')
    print('df with id as index')
    print(df)
except psycopg2.DatabaseError as e:
    print('Error %s' % e)
    sys.exit(1)
finally:
    if con:
        con.close()
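
index_col also accepts a list of column names, in which case pandas builds a MultiIndex. A quick sketch, assuming the cars table has the id and name columns used throughout these examples:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://postgres:123@localhost:5432/postgres')

# A list of columns produces a MultiIndex on (id, name).
df = pd.read_sql('SELECT * FROM cars', engine, index_col=['id', 'name'])
print(df)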

3. Writing to a PostgreSQL database with pandas

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import psycopg2
import sys
from sqlalchemy import create_engine

con = None

df1 = pd.DataFrame({'id': [5, 6], 'name': ['Audi', 'BMW'], 'price': [11111, 22222]})

try:
    con = psycopg2.connect("dbname='postgres' user='postgres' password='123'")

    # Any of the following engine URLs works; the commented lines show the
    # general URL format for PostgreSQL and MySQL backends.
    # engine = create_engine('postgresql://postgres:123@localhost:5432/postgres', echo=False)
    # engine = create_engine('postgresql://postgres:123@localhost/postgres', echo=False)
    engine = create_engine('postgresql://postgres:123@localhost', echo=False)
    # engine = create_engine('mysql+mysqlconnector://[user]:[pass]@[host]:[port]/[schema]', echo=False)
    df1.to_sql(name='cars', con=engine, if_exists='append', index=False, chunksize=10000)

    sql = "SELECT * FROM Cars"
    df = pd.read_sql(sql, con, index_col='id')
    print('df')
    print(df)
except psycopg2.DatabaseError as e:
    print('Error %s' % e)
    sys.exit(1)
finally:
    if con:
        con.close()
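
The if_exists argument controls what to_sql does when the target table already exists: 'fail' (the default) raises an error, 'replace' drops and recreates the table, and 'append' inserts the new rows, as used above. A small sketch of the 'replace' variant, reusing the same df1 and placeholder credentials:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://postgres:123@localhost:5432/postgres')
df1 = pd.DataFrame({'id': [5, 6], 'name': ['Audi', 'BMW'], 'price': [11111, 22222]})

# 'replace' drops the existing cars table and recreates it from df1;
# 'append' (used above) would add the rows instead, 'fail' would raise.
df1.to_sql(name='cars', con=engine, if_exists='replace', index=False)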

Large batch inserts can be done by specifying the chunksize parameter: pandas automatically splits the data into chunks of chunksize rows and inserts them batch by batch, similar in principle to the loop-based insert approach mentioned above. If this parameter is not specified, pandas inserts every record in the DataFrame at once; if the MySQL server cannot handle an insert of that size, the error described above occurs. A correct call looks like this (for MySQL):

pd.io.sql.to_sql(df, 'xxx', zh_con, flavor='mysql', if_exists='append', index=False, chunksize=10000)

The chunksize here can be set to whatever value suits you.
http://stackoverflow.com/questions/31837979/pandas-sql-chunksize
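
In newer pandas releases the flavor argument shown above has been removed entirely; batching is controlled by chunksize alone, optionally combined with method='multi' (available since pandas 0.24), which packs each chunk into a single multi-row INSERT statement. A sketch against the PostgreSQL engine from the examples above:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://postgres:123@localhost:5432/postgres')
df1 = pd.DataFrame({'id': [5, 6], 'name': ['Audi', 'BMW'], 'price': [11111, 22222]})

# chunksize splits the DataFrame into batches of at most 10000 rows;
# method='multi' sends each batch as one multi-row INSERT.
df1.to_sql('cars', engine, if_exists='append', index=False,
           chunksize=10000, method='multi')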