Parse Dates Efficiently

29/05/2018

import os; import pandas as pd; import numpy  as np

Import data from a tsv file

df = pd.read_csv("flight_edges.tsv", sep='\t', header=None)
df.columns = [
    'Origin','Destination', 
    'Origin City', 'Destination City', 
    'Passengers','Seats','Flights','Distance','Fly Date', 
    'Origin Population', 'Destination Population'
]

df.head()
Origin Destination Origin City Destination City Passengers Seats Flights Distance Fly Date Origin Population Destination Population
0 MHK AMW Manhattan, KS Ames, IA 21 30 1 254.0 200810 122049 86219
1 EUG RDM Eugene, OR Bend, OR 41 396 22 103.0 199011 284093 76034
2 EUG RDM Eugene, OR Bend, OR 88 342 19 103.0 199012 284093 76034
3 EUG RDM Eugene, OR Bend, OR 11 72 4 103.0 199010 284093 76034
4 MFR RDM Medford, OR Bend, OR 0 18 1 156.0 199002 147300 76034

Parse Dates (efficiently)

date_parser = lambda x: pd.datetime.strptime(str(x), '%Y%m')
# Slow way (30 seconds)
df['Date']  = df['Fly Date'].apply(date_parser)
# Faster way (1.5 seconds)

df['date_index'] = df['Fly Date']
dates = df.groupby(['date_index']).first()['Fly Date'].apply(date_parser)

df = df.set_index([ 'date_index' ])
df['Date'] = dates
df = df.reset_index()

df.head()
date_index Origin Destination Origin City Destination City Passengers Seats Flights Distance Fly Date Origin Population Destination Population Date
0 200810 MHK AMW Manhattan, KS Ames, IA 21 30 1 254.0 200810 122049 86219 2008-10-01
1 199011 EUG RDM Eugene, OR Bend, OR 41 396 22 103.0 199011 284093 76034 1990-11-01
2 199012 EUG RDM Eugene, OR Bend, OR 88 342 19 103.0 199012 284093 76034 1990-12-01
3 199010 EUG RDM Eugene, OR Bend, OR 11 72 4 103.0 199010 284093 76034 1990-10-01
4 199002 MFR RDM Medford, OR Bend, OR 0 18 1 156.0 199002 147300 76034 1990-02-01



Author: Andrea Barbon


Back