dictionary - Processing chunks of data - requests / Python -
i have huge files process , searching here found out requests
library use. example of file (small one) processed that:
https://storage.googleapis.com/tlc-trip-data/2015/green_tripdata_2015-06.csv
this code process such data:
import requests import csv def consumetaxidata(url): """ given url, reads content , process data. :param url: url readen. :return: list of tuples in form (long, lat, hour). """ print "processing", url points = [] r = requests.get(url, stream=true) keys = none = 0 chunk in r.iter_content(chunk_size=1024): if chunk: if == 0: reader = csv.dictreader(chunk.splitlines(), delimiter=',') else: reader = csv.dictreader(chunk.splitlines(), fieldnames=keys, delimiter=',') line in reader: if == 0: keys = line.keys() print "keys", keys latitude = line.get('dropoff_latitude', none) if latitude none: latitude = line.get('dropoff_latitude', none) longitude = line.get('dropoff_longitude', none) if longitude none: longitude = line.get('dropoff_longitude', none) time = line.get('tpep_dropoff_datetime', none) if time none: time = line.get('lpep_dropoff_datetime', none) if time not none , latitude not none , longitude not none , \ datetime.strptime(time, '%y-%m-%d %h:%m:%s') >= datetime.strptime(date, '%y-%m-%d'): time = roundtime(datetime.strptime(time, '%y-%m-%d %h:%m:%s'), roundto=60 * 60).hour points.append((longitude, latitude, time)) i+=1 return points
the first line of data contains field names and, i'm interested in 3 of them, though using dictreader
. however, print statement give me this:
keys ['vendorid', 'total_amount', 'lpep_pickup_datetime', 'passenger_count', 'payment_type', 'store_and_fwd_flag', 'pickup_latitude', 'trip_type ', 'lpep_dropoff_datetime', 'ratecodeid', 'trip_distance', 'fare_amount', 'pickup_longitude', 'dropoff_latitude', 'tolls_amount', 'improvement_surcharge', 'tip_amount', none, 'extra', 'mta_tax', 'ehail_fee', 'dropoff_longitude'] {'vendorid': '2', 'total_amount': '11.8', 'lpep_pickup_datetime': '2015-06-01 00:00:00', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.881328582763672', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:09:32', 'ratecodeid': '1', 'trip_distance': '2.64', 'fare_amount': '10.5', 'pickup_longitude': '-73.878700256347656', 'dropoff_latitude': '40.884838104248047', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.838386535644531'} {'vendorid': '2', 'total_amount': '17.3', 'lpep_pickup_datetime': '2015-06-01 00:00:05', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.876182556152344', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:12:41', 'ratecodeid': '1', 'trip_distance': '4.79', 'fare_amount': '16', 'pickup_longitude': '-73.906356811523438', 'dropoff_latitude': '40.830490112304688', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.944488525390625'} {'vendorid': '2', 'total_amount': '10.3', 'lpep_pickup_datetime': '2015-06-01 00:00:09', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.747196197509766', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:11:29', 'ratecodeid': '1', 'trip_distance': '1.45', 'fare_amount': '9', 'pickup_longitude': '-73.887863159179688', 'dropoff_latitude': '40.738815307617188', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.888786315917969'} {'vendorid': '2', 'total_amount': '5.8', 'lpep_pickup_datetime': '2015-06-01 00:00:26', 'passenger_count': '1', 'payment_type': '2', 'store_and_fwd_flag': 'n', 'pickup_latitude': '40.770065307617187', 'trip_type ': '1', 'lpep_dropoff_datetime': '2015-06-01 00:03:51', 'ratecodeid': '1', 'trip_distance': '.74', 'fare_amount': '4.5', 'pickup_longitude': '-73.917800903320312', 'dropoff_latitude': '40.766143798828125', 'tolls_amount': '0', 'improvement_surcharge': '0.3', 'tip_amount': '0', none: ['', ''], 'extra': '0.5', 'mta_tax': '0.5', 'ehail_fee': '', 'dropoff_longitude': '-73.907890319824219'} {'trip_distance': none, 'vendorid': '1', 'improvement_surcharge': none, 'tip_amount': none, 'total_amount': none, 'lpep_pickup_datetime': '2015-06-01 00:00:18', 'extra': none, 'pickup_latitude': '40.717', 'ehail_fee': none, 'fare_amount': none, 'pickup_longitude': '-73.956329345703125', 'tolls_amount': none, 'dropoff_longitude': none, 'passenger_count': none, 'payment_type': none, 'mta_tax': none, 'lpep_dropoff_datetime': '2015-06-01 00:04:31', 'store_and_fwd_flag': 'n', 'ratecodeid': '1', 'dropoff_latitude': none, 'trip_type ': none} ['121124267578,-73.950599670410156,40.723434448242187,1,.80,5,0.5,0.5,1.25,0,,0.3,7.55,1,1,,', '2,2015-06-01 00:00:16,2015-06-01 00:10:29,n,1,-73.939163208007812,40.816555023193359,-73.938468933105469,40.796218872070313,1,1.94,9.5,0.5,0.5,0,0,,0.3,10.8,2,1,,', '2,2015-06-01 00:00:29,2015-06-01 00:26:47,n,1,-73.941329956054687,40.813583374023438,-73.918571472167969,40.811511993408203,1,6.26,22.5,0.5,0.5,0,0,,0.3,23.8,2,1,,', '2,2015-06-01 00:01:15,2015-06-01 00:04:11,n,1,-73.997383117675781,40.674507141113281,-73.98590087890625,40.67755126953125,1,.90,5,0.5,0.5,1.26,0,,0.3,7.56,1,1,,', '2,2015-06-01 00:00:39,2015-06-01 00:06:35,n,1,-73.891006469726563,40.746994018554687,-73.880416870117187,40.749176025390625,1,.71,5.5,0.5,0.5,0,0,,0.3,6.8,2,1,,', '2,2015-06-01 00:00:34,2015-06-01 00:10:13,n,1,-73.969017028808594,40.693115234375,-73.950355529785156,40.706508636474609,2,1.96,9,0.5,0.5,0,0,,0.3,10.3,2,1,,', '2,2015-06-01 00:01:06,2015-06-01 00:32:00,n,1,-73.928153991699219,40.695011138916016,-73.954338073730469,40.773025512695'] {'vendorid': '121124267578', 'total_amount': '-73.950599670410156', 'lpep_pickup_datetime': '40.723434448242187', 'passenger_count': '1', 'payment_type': '.80', 'store_and_fwd_flag': '5', 'pickup_latitude': '0.5', 'trip_type ': '0.5', 'lpep_dropoff_datetime': '1.25', 'ratecodeid': '0', 'trip_distance': '', 'fare_amount': '0.3', 'pickup_longitude': '7.55', 'dropoff_latitude': '1', 'tolls_amount': '1', 'improvement_surcharge': '', 'tip_amount': '', none: none, 'ehail_fee': none, 'mta_tax': none, 'extra': none, 'dropoff_longitude': none} traceback (most recent call last): file "/users/paulaceccon/documents/projetos/nycnoise/scripts/noiseinference.py", line 490, in <module> taxi_dropoffs = gettaxitrips(date) file "/users/paulaceccon/documents/projetos/nycnoise/scripts/noiseinference.py", line 300, in gettaxitrips {'vendorid': '2', 'total_amount': '2015-06-01 00:00:16', 'lpep_pickup_datetime': '2015-06-01 00:10:29', 'passenger_count': 'n', 'payment_type': '1', 'store_and_fwd_flag': '-73.939163208007812', 'pickup_latitude': '40.816555023193359', 'trip_type ': '-73.938468933105469', 'lpep_dropoff_datetime': '40.796218872070313', 'ratecodeid': '1', 'trip_distance': '1.94', 'fare_amount': '9.5', 'pickup_longitude': '0.5', 'dropoff_latitude': '0.5', 'tolls_amount': '0', 'improvement_surcharge': '0', 'tip_amount': '', none: [''], 'ehail_fee': '1', 'mta_tax': '2', 'extra': '10.8', 'dropoff_longitude': ''} result = pool.map(consumetaxidata, data) file "/usr/local/cellar/python/2.7.11/frameworks/python.framework/versions/2.7/lib/python2.7/multiprocessing/pool.py", line 251, in map return self.map_async(func, iterable, chunksize).get() file "/usr/local/cellar/python/2.7.11/frameworks/python.framework/versions/2.7/lib/python2.7/multiprocessing/pool.py", line 567, in raise self._value valueerror: time data '40.796218872070313' not match format '%y-%m-%d %h:%m:%s'
i'm wondering what's causing this, why got messed @ 5th dict print line, , how solve it.
you should check
from dateutil.parser import parse def is_date(string): try: parse(string) return true except valueerror: return false
then
if is_date(time) , latitude not none , longitude not none: if datetime.strptime(time, '%y-%m-%d %h:%m:%s') >= datetime.strptime(date, '%y-%m-%d'): time = roundtime(datetime.strptime(time, '%y-%m-%d %h:%m:%s'), roundto=60 * 60).hour points.append((longitude, latitude, time))
it solve error
valueerror: time data '40.796218872070313' not match format '%y-%m-%d %h:%m:%s'
Comments
Post a Comment