dictionary - Processing chunks of data - requests / Python -


i have huge files process , searching here found out requests library use. example of file (small one) processed that:

https://storage.googleapis.com/tlc-trip-data/2015/green_tripdata_2015-06.csv

this code process such data:

import requests import csv  def consumetaxidata(url):     """     given url, reads content , process data.     :param url: url readen.     :return: list of tuples in form (long, lat, hour).     """     print "processing", url     points = []      r = requests.get(url, stream=true)     keys = none     = 0     chunk in r.iter_content(chunk_size=1024):         if chunk:             if == 0:                 reader = csv.dictreader(chunk.splitlines(), delimiter=',')             else:                 reader = csv.dictreader(chunk.splitlines(), fieldnames=keys, delimiter=',')             line in reader:                 if == 0:                     keys = line.keys()                     print "keys", keys                  latitude = line.get('dropoff_latitude', none)                 if latitude none:                     latitude = line.get('dropoff_latitude', none)                  longitude = line.get('dropoff_longitude', none)                 if longitude none:                     longitude = line.get('dropoff_longitude', none)                  time = line.get('tpep_dropoff_datetime', none)                 if time none:                     time = line.get('lpep_dropoff_datetime', none)                 if time not none , latitude not none , longitude not none , \                    datetime.strptime(time, '%y-%m-%d %h:%m:%s') >= datetime.strptime(date, '%y-%m-%d'):                     time = roundtime(datetime.strptime(time, '%y-%m-%d %h:%m:%s'), roundto=60 * 60).hour                     points.append((longitude, latitude, time))                 i+=1      return points 

the first line of data contains field names and, i'm interested in 3 of them, though using dictreader. however, print statement give me this:

 keys ['vendorid', 'total_amount', 'lpep_pickup_datetime', 'passenger_count', 'payment_type', 'store_and_fwd_flag', 'pickup_latitude', 'trip_type ', 'lpep_dropoff_datetime', 'ratecodeid', 'trip_distance', 'fare_amount', 'pickup_longitude', 'dropoff_latitude', 'tolls_amount', 'improvement_surcharge', 'tip_amount', none, 'extra', 'mta_tax', 'ehail_fee', 'dropoff_longitude'] {'vendorid': '2', 'total_amount': '11.8', 'lpep_pickup_datetime': '2015-06-01 00:00:00', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.881328582763672', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:09:32', 'ratecodeid': '1', 'trip_distance': '2.64', 'fare_amount': '10.5', 'pickup_longitude': '-73.878700256347656', 'dropoff_latitude': '40.884838104248047', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.838386535644531'} {'vendorid': '2', 'total_amount': '17.3', 'lpep_pickup_datetime': '2015-06-01 00:00:05', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.876182556152344', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:12:41', 'ratecodeid': '1', 'trip_distance': '4.79', 'fare_amount': '16', 'pickup_longitude': '-73.906356811523438', 'dropoff_latitude': '40.830490112304688', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.944488525390625'} {'vendorid': '2', 'total_amount': '10.3', 'lpep_pickup_datetime': '2015-06-01 00:00:09', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.747196197509766', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:11:29', 'ratecodeid': '1', 'trip_distance': '1.45', 'fare_amount': '9', 'pickup_longitude': '-73.887863159179688', 'dropoff_latitude': '40.738815307617188', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.888786315917969'} {'vendorid': '2', 'total_amount': '5.8', 'lpep_pickup_datetime': '2015-06-01 00:00:26', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.770065307617187', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:03:51', 'ratecodeid': '1', 'trip_distance': '.74', 'fare_amount': '4.5', 'pickup_longitude': '-73.917800903320312', 'dropoff_latitude': '40.766143798828125', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.907890319824219'} {'trip_distance': none, 'vendorid': '1', 'improvement_surcharge': none, 'tip_amount': none, 'total_amount': none, 'lpep_pickup_datetime': '2015-06-01 00:00:18', 'extra': none, 'pickup_latitude': '40.717', 'ehail_fee': none, 'fare_amount': none, 'pickup_longitude': '-73.956329345703125', 'tolls_amount': none, 'dropoff_longitude': none, 'passenger_count': none, 'payment_type': none, 'mta_tax': none, 'lpep_dropoff_datetime': '2015-06-01 00:04:31', 'store_and_fwd_flag': 'n', 'ratecodeid': '1', 'dropoff_latitude': none, 'trip_type ': none} ['121124267578,-73.950599670410156,40.723434448242187,1,.80,5,0.5,0.5,1.25,0,,0.3,7.55,1,1,,', '2,2015-06-01 00:00:16,2015-06-01 00:10:29,n,1,-73.939163208007812,40.816555023193359,-73.938468933105469,40.796218872070313,1,1.94,9.5,0.5,0.5,0,0,,0.3,10.8,2,1,,', '2,2015-06-01 00:00:29,2015-06-01 00:26:47,n,1,-73.941329956054687,40.813583374023438,-73.918571472167969,40.811511993408203,1,6.26,22.5,0.5,0.5,0,0,,0.3,23.8,2,1,,', '2,2015-06-01 00:01:15,2015-06-01 00:04:11,n,1,-73.997383117675781,40.674507141113281,-73.98590087890625,40.67755126953125,1,.90,5,0.5,0.5,1.26,0,,0.3,7.56,1,1,,', '2,2015-06-01 00:00:39,2015-06-01 00:06:35,n,1,-73.891006469726563,40.746994018554687,-73.880416870117187,40.749176025390625,1,.71,5.5,0.5,0.5,0,0,,0.3,6.8,2,1,,', '2,2015-06-01 00:00:34,2015-06-01 00:10:13,n,1,-73.969017028808594,40.693115234375,-73.950355529785156,40.706508636474609,2,1.96,9,0.5,0.5,0,0,,0.3,10.3,2,1,,', '2,2015-06-01 00:01:06,2015-06-01 00:32:00,n,1,-73.928153991699219,40.695011138916016,-73.954338073730469,40.773025512695'] {'vendorid': '121124267578', 'total_amount': '-73.950599670410156', 'lpep_pickup_datetime': '40.723434448242187', 'passenger_count': '1', 'payment_type': '.80', 'store_and_fwd_flag': '5', 'pickup_latitude': '0.5', 'trip_type ': '0.5', 'lpep_dropoff_datetime': '1.25', 'ratecodeid': '0', 'trip_distance': '', 'fare_amount': '0.3', 'pickup_longitude': '7.55', 'dropoff_latitude': '1', 'tolls_amount': '1', 'improvement_surcharge': '', 'tip_amount': '', none: none, 'ehail_fee': none, 'mta_tax': none, 'extra': none, 'dropoff_longitude': none} traceback (most recent call last):   file "/users/paulaceccon/documents/projetos/nycnoise/scripts/noiseinference.py", line 490, in <module>     taxi_dropoffs = gettaxitrips(date)   file "/users/paulaceccon/documents/projetos/nycnoise/scripts/noiseinference.py", line 300, in gettaxitrips {'vendorid': '2', 'total_amount': '2015-06-01 00:00:16', 'lpep_pickup_datetime': '2015-06-01 00:10:29', 'passenger_count': 'n', 'payment_type': '1', 'store_and_fwd_flag': '-73.939163208007812', 'pickup_latitude': '40.816555023193359', 'trip_type ': '-73.938468933105469', 'lpep_dropoff_datetime': '40.796218872070313', 'ratecodeid': '1', 'trip_distance': '1.94', 'fare_amount': '9.5', 'pickup_longitude': '0.5', 'dropoff_latitude': '0.5', 'tolls_amount': '0', 'improvement_surcharge': '0', 'tip_amount': '', none: [''], 'ehail_fee': '1', 'mta_tax': '2', 'extra': '10.8', 'dropoff_longitude': ''}     result = pool.map(consumetaxidata, data)   file "/usr/local/cellar/python/2.7.11/frameworks/python.framework/versions/2.7/lib/python2.7/multiprocessing/pool.py", line 251, in map     return self.map_async(func, iterable, chunksize).get()   file "/usr/local/cellar/python/2.7.11/frameworks/python.framework/versions/2.7/lib/python2.7/multiprocessing/pool.py", line 567, in     raise self._value valueerror: time data '40.796218872070313' not match format '%y-%m-%d %h:%m:%s' 

i'm wondering what's causing this, why got messed @ 5th dict print line, , how solve it.

you should check

from dateutil.parser import parse  def is_date(string):     try:          parse(string)         return true     except valueerror:         return false 

then

if is_date(time) , latitude not none , longitude not none:    if datetime.strptime(time, '%y-%m-%d %h:%m:%s') >= datetime.strptime(date, '%y-%m-%d'):         time = roundtime(datetime.strptime(time, '%y-%m-%d %h:%m:%s'), roundto=60 * 60).hour         points.append((longitude, latitude, time))    

it solve error

valueerror: time data '40.796218872070313' not match format '%y-%m-%d %h:%m:%s' 

Comments

Popular posts from this blog

ruby - Trying to change last to "x"s to 23 -

jquery - Clone last and append item to closest class -

c - Unrecognised emulation mode: elf_i386 on MinGW32 -