Reading from a SQLite DB into Pandas

01 Mar 2020

This note demonstrates reading from an exising SQLite database.

import pandas as pd
import sqlite3

Using Bash, it is possible to print out information about the database directly in Jupyter.

%%bash
cd reading-from-sqlite-db-to-pandas
sqlite3 flights.db
.tables

airlines  airports  routes

Connect to the Database and Read from It

The following demonstrates opening and reading from each of the tables listed above.

Airlines

conn = sqlite3.connect('reading-from-sqlite-db-to-pandas/flights.db')
airline_string = '''SELECT * 
                    FROM airlines'''

airlines = pd.read_sql(airline_string,
                       conn,
                       index_col='index')

conn.close()

%%bash
cd reading-from-sqlite-db-to-pandas
sqlite3 flights.db
.schema airlines

CREATE TABLE airlines (
[index] INTEGER,
  [id] TEXT,
  [name] TEXT,
  [alias] TEXT,
  [iata] TEXT,
  [icao] TEXT,
  [callsign] TEXT,
  [country] TEXT,
  [active] TEXT
);
CREATE INDEX ix_airlines_index ON airlines ([index]);

airlines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6048 entries, 0 to 6047
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6048 non-null   object
 1   name      6048 non-null   object
 2   alias     5615 non-null   object
 3   iata      1461 non-null   object
 4   icao      5961 non-null   object
 5   callsign  5305 non-null   object
 6   country   6033 non-null   object
 7   active    6048 non-null   object
dtypes: object(8)
memory usage: 425.2+ KB

airlines.head()

	id	name	alias	iata	icao	callsign	country	active
index
0	1	Private flight	\N	-	None	None	None	Y
1	2	135 Airways	\N	None	GNL	GENERAL	United States	N
2	3	1Time Airline	\N	1T	RNX	NEXTIME	South Africa	Y
3	4	2 Sqn No 1 Elementary Flying Training School	\N	None	WYT	None	United Kingdom	N
4	5	213 Flight Unit	\N	None	TFU	None	Russia	N

Airports

conn = sqlite3.connect('reading-from-sqlite-db-to-pandas/flights.db')
airports_string = '''SELECT * 
                     FROM airports'''

airports = pd.read_sql(airports_string,
                       conn,
                       index_col='index')

conn.close()

%%bash
cd reading-from-sqlite-db-to-pandas
sqlite3 flights.db
.schema airports

CREATE TABLE airports (
[index] INTEGER,
  [id] TEXT,
  [name] TEXT,
  [city] TEXT,
  [country] TEXT,
  [code] TEXT,
  [icao] TEXT,
  [latitude] TEXT,
  [longitude] TEXT,
  [altitude] TEXT,
  [offset] TEXT,
  [dst] TEXT,
  [timezone] TEXT
);
CREATE INDEX ix_airports_index ON airports ([index]);

airports.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8107 entries, 0 to 8106
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         8107 non-null   object
 1   name       8107 non-null   object
 2   city       8107 non-null   object
 3   country    8107 non-null   object
 4   code       5880 non-null   object
 5   icao       8043 non-null   object
 6   latitude   8107 non-null   object
 7   longitude  8107 non-null   object
 8   altitude   8107 non-null   object
 9   offset     8107 non-null   object
 10  dst        8107 non-null   object
 11  timezone   8107 non-null   object
dtypes: object(12)
memory usage: 823.4+ KB

airports.head()

	id	name	city	country	code	icao	latitude	longitude	altitude	offset	dst	timezone
index
0	1	Goroka	Goroka	Papua New Guinea	GKA	AYGA	-6.081689	145.391881	5282	10	U	Pacific/Port_Moresby
1	2	Madang	Madang	Papua New Guinea	MAG	AYMD	-5.207083	145.7887	20	10	U	Pacific/Port_Moresby
2	3	Mount Hagen	Mount Hagen	Papua New Guinea	HGU	AYMH	-5.826789	144.295861	5388	10	U	Pacific/Port_Moresby
3	4	Nadzab	Nadzab	Papua New Guinea	LAE	AYNZ	-6.569828	146.726242	239	10	U	Pacific/Port_Moresby
4	5	Port Moresby Jacksons Intl	Port Moresby	Papua New Guinea	POM	AYPY	-9.443383	147.22005	146	10	U	Pacific/Port_Moresby

Routes

conn = sqlite3.connect('reading-from-sqlite-db-to-pandas/flights.db')
routes_string = '''SELECT * 
                   FROM routes'''

routes = pd.read_sql(routes_string,
                     conn,
                     index_col='index')

conn.close()

%%bash
cd reading-from-sqlite-db-to-pandas
sqlite3 flights.db
.schema routes

CREATE TABLE routes (
[index] INTEGER,
  [airline] TEXT,
  [airline_id] TEXT,
  [source] TEXT,
  [source_id] TEXT,
  [dest] TEXT,
  [dest_id] TEXT,
  [codeshare] TEXT,
  [stops] TEXT,
  [equipment] TEXT
);
CREATE INDEX ix_routes_index ON routes ([index]);

routes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67663 entries, 0 to 67662
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   airline     67663 non-null  object
 1   airline_id  67663 non-null  object
 2   source      67663 non-null  object
 3   source_id   67663 non-null  object
 4   dest        67663 non-null  object
 5   dest_id     67663 non-null  object
 6   codeshare   14597 non-null  object
 7   stops       67663 non-null  object
 8   equipment   67645 non-null  object
dtypes: object(9)
memory usage: 5.2+ MB

routes.head()

	airline	airline_id	source	source_id	dest	dest_id	codeshare	stops	equipment
index
0	2B	410	AER	2965	KZN	2990	None	0	CR2
1	2B	410	ASF	2966	KZN	2990	None	0	CR2
2	2B	410	ASF	2966	MRV	2962	None	0	CR2
3	2B	410	CEK	2968	KZN	2990	None	0	CR2
4	2B	410	CEK	2968	OVB	4078	None	0	CR2

Some of this content, including the flights database, was taken from this article from dataquest.io.