From 4505fba0996614e8e0e468d39d3980d04b3e54ed Mon Sep 17 00:00:00 2001 From: Markus Birth Date: Sun, 14 Jan 2018 03:23:42 +0100 Subject: [PATCH] Initial commit. --- .gitignore | 6 ++ README.md | 83 +++++++++++++++ calc_trips.py | 75 ++++++++++++++ data/.gitignore | 0 getData.sh | 23 +++++ import.py | 162 ++++++++++++++++++++++++++++++ init_db.sh | 3 + old/analyse.py | 130 ++++++++++++++++++++++++ requirements.txt | 1 + sql/dbschema.sql | 119 ++++++++++++++++++++++ sql/geocoder-fail.sql | 6 ++ sql/testquery-brandenburggate.sql | 4 + sql/testquery-history.sql | 3 + testquery.sh | 8 ++ 14 files changed, 623 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 calc_trips.py create mode 100644 data/.gitignore create mode 100644 getData.sh create mode 100644 import.py create mode 100644 init_db.sh create mode 100644 old/analyse.py create mode 100644 requirements.txt create mode 100644 sql/dbschema.sql create mode 100644 sql/geocoder-fail.sql create mode 100644 sql/testquery-brandenburggate.sql create mode 100644 sql/testquery-history.sql create mode 100644 testquery.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eea8a83 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +/data/* +!/data/.gitignore +*.pyc +/car2go.db3 +/car2go.db3-shm +/car2go.db3-wal diff --git a/README.md b/README.md new file mode 100644 index 0000000..bece95d --- /dev/null +++ b/README.md @@ -0,0 +1,83 @@ +Car Sharing Analyser +==================== + +This is a collection of scripts and tools to gather, prepare and analyse data about a +German car sharing company. + + +Gathering +--------- + +The script `getData.sh` collects a JSON dump from the webpage every 30 seconds and stores +that into the `data/` folder. A week's worth of data is about 3.3 GiB. + +Edit the file first to configure your desired city. Then let it run in a tmux session. + + +Preparing +--------- + +`init_db.sh` will create a SQLite database file according to `sql/dbschema.sql`. + +Make sure you have Python 3 installed. Install required Python packages by running: + + sudo -H pip3 install -r requirements.txt + +(Or use virtualenv, pipenv, venv, etc.) + +Then run `import.py` to import all the JSON dumps into the database. If you continue the data +collection, you can run `import.py` later to import the new data. Successfully imported JSON +dumps can be deleted, of course. + +Importing a week's worth of data takes about 5 minutes and results in a 14.5 MiB +SQLite3 database. + + +Analysing +--------- + +Run `calc_trips.py` to analyse car's state changes and calculate possible(!) trips from it. +The trips data is written back into the database. (Trips shorter than 70 seconds are filtered. +See notes below.) A week's worth of trip data increases the database by about 4 MiB. + +You can also use this script as a starting point for your own analysing scripts. +(Pull requests are welcome.) + +For working with the database itself, I recommend +[DB Browser for SQLite](http://sqlitebrowser.org/). + + +### Example Queries + +To see the history of a specific car, you can run e.g. (`XYZ` = number plate): + + SELECT * FROM car_history WHERE plate="XYZ"; + +To see cars in a specific area, you can run: + + SELECT * + FROM car_history + WHERE latitude>=52.515652 AND longitude>=13.372373 + AND latitude<=52.516813 AND longitude<=13.378115; + +Also check out the views in the database. + + +Notes +===== + +* The `distance_km` is beeline from starting point to end point. If somebody runs errands and parks + in the exact same spot, the distance is (almost) 0. +* You can't distinguish between these cases with distances of (almost) zero and no petrol spent: + * a car that made a short trip and parked in the same spot, + * a car that has been taken out of service for a while, + * a car that has been reserved (possible for up to 30 minutes) but the reservation expired +* "Trips" over several days are most probably cars that have been taken out of service. +* A car that has been "reserved" (for up to 30 minutes) disappears from the list of cars. The + reservation time is included in the trip's `duration_minutes`. +* The calculated prices don't factor in additional fees (airport, drop-off), the time a car was + "reserved", and are also calculated for cars taken offline, i.e. where no money was paid by + the customer. +* Smaller negative `fuel_spent` values are probably because the car was parked on a slope. +* The id for a car is it's number plate. Theoretically, a `plate` could be put on another + vehicle (with a different `vin`). But this is very unlikely. diff --git a/calc_trips.py b/calc_trips.py new file mode 100644 index 0000000..650c28f --- /dev/null +++ b/calc_trips.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import geopy.distance +import math +import sqlite3 + +ANSI_UP_DEL = u"\u001b[F\u001b[K" + +class C2GCalcTrips: + def __init__(self): + self.dbfile = "car2go.db3" + self.conn = sqlite3.connect(self.dbfile) + self.c = self.conn.cursor() + self.c.execute("PRAGMA foreign_keys=on;") + self.run() + + def run(self): + all_cars = self.load_cars() + print("Clearing table.") + self.c.execute("DELETE FROM trips;") + print("Going to analyse {} cars.".format(len(all_cars))) + inserter = self.conn.cursor() + num_trips = 0 + cars_done = 0 + for carId in all_cars: + is_occupied = False + self.c.execute("SELECT stamp, occupied, fuel, latitude, longitude FROM car_state WHERE carId=? ORDER BY stamp ASC;", (carId,)) + for row in self.c: + if row[1] == 1: + is_occupied = True + occu_state = row + elif row[1] == 0 and is_occupied: + is_occupied = False + seconds = row[0] - occu_state[0] + minutes = seconds / 60 + if seconds <= 70: + # probably a glitch + continue + price = math.ceil(minutes) * all_cars[carId] + old_spot = (occu_state[3], occu_state[4]) + new_spot = (row[3], row[4]) + distance = geopy.distance.distance(old_spot, new_spot).kilometers + dataset = ( + carId, + occu_state[0], + row[0], + minutes, + distance, + (occu_state[2] - row[2]), + price + ) + #print(repr(dataset)) + inserter.execute("INSERT INTO trips (carId, stamp_departure, stamp_arrival, duration_minutes, distance_km, fuel_spent, price) VALUES (?,?,?,?,?,?,?);", dataset) + num_trips += 1 + cars_done += 1 + print("{}/{} cars done. {} trips found.".format(cars_done, len(all_cars), num_trips)) + if cars_done < len(all_cars): + print(ANSI_UP_DEL, end="") + print("") + + self.conn.commit() + self.c.execute("VACUUM;") + self.conn.close() + + def load_cars(self): + print("Loading known cars from database.") + self.c.execute("SELECT c.carId, c.plate, cm.pricePerMinute FROM cars c LEFT JOIN car_models cm ON c.vinPrefix=cm.vinPrefix;") + result = {} + for row in self.c.fetchall(): + result[row[0]] = row[2] + return result + +if __name__=="__main__": + C2GCalcTrips() diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/getData.sh b/getData.sh new file mode 100644 index 0000000..cf73a3d --- /dev/null +++ b/getData.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# LOC is one of: berlin, frankfurt, hamburg, muenchen, rheinland, stuttgart +LOC="berlin" +OAUTH_CONSUMER_KEY="car2gowebsite" +FORMAT="json" +OUTPUT_DIR="data" + +# URL found via browser's dev tools' "Networking" view +URL="https://www.car2go.com/api/v2.1/vehicles?loc=${LOC}&oauth_consumer_key=${OAUTH_CONSUMER_KEY}&format=${FORMAT}" +echo "URL: $URL" + +while [ 1 ]; do + STAMP=`date +%Y-%m-%d_%H%M%S` + OUTPUT="$OUTPUT_DIR/$STAMP.json" + wget -q "$URL" -O "$OUTPUT" + if [ $? -eq 0 ]; then + echo -n "." + else + echo -n "!" + rm "$OUTPUT" + fi + sleep 30 +done diff --git a/import.py b/import.py new file mode 100644 index 0000000..93974dd --- /dev/null +++ b/import.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import datetime +import glob +import json +import math +import os +import re +import sqlite3 + +ANSI_UP_DEL = u"\u001b[F\u001b[K" + +class C2GImport: + def __init__(self): + self.state = {} + self.in_use = set() + self.carids = {} + self.dbfile = "car2go.db3" + self.conn = sqlite3.connect(self.dbfile) + self.conn.row_factory = sqlite3.Row + self.c = self.conn.cursor() + self.c.execute("PRAGMA foreign_keys=on;") + self.run() + + def run(self): + self.load_cars() + self.load_data() + self.c.execute("SELECT MAX(stamp) FROM car_state;"); + max_date_row = self.c.fetchone() + if max_date_row[0] is not None: + max_date = max_date_row[0] + max_date_dt = datetime.datetime.fromtimestamp(max_date, tz=datetime.timezone(datetime.timedelta(hours=1))) + print("Skipping already collected data until {}.".format(repr(max_date_dt))) + else: + max_date_dt = datetime.datetime.min.replace(tzinfo=datetime.timezone(datetime.timedelta(hours=1))) + + ctr = 0 + file_list = sorted(glob.glob(os.path.join("data/", "*.json"))) + file_count = len(file_list) + for f in file_list: + stamp = re.search(r'(\d{4}-\d\d-\d\d_\d{6})', f) + stamp_str = stamp.group(1) + stamp_dt = datetime.datetime.strptime(stamp_str + "+0100", "%Y-%m-%d_%H%M%S%z") + if stamp_dt <= max_date_dt: + continue + ctr += 1 + self.scanfile(f, stamp_str, stamp_dt, ctr, file_count) + if ctr%50==0: + self.conn.commit() + self.conn.commit() + self.c.execute("VACUUM;") + self.conn.close() + + def load_cars(self): + print("Loading known cars from database.") + self.c.execute("SELECT carId, plate FROM cars;") + for row in self.c.fetchall(): + self.carids[row[1]] = row[0] + + def load_data(self): + print("Load known states from database.") + self.c.execute("SELECT c.plate, c.vin, cs.occupied, cs.address, cs.latitude, cs.longitude, cs.fuel, c.engineType, c.smartPhoneRequired, cs.interior_bad, cs.exterior_bad FROM car_state cs LEFT JOIN cars c ON cs.carId=c.carId;") + for row in self.c: + car_id = row[0] + dataset = { + "name": car_id, + "vin": row[1], + "occupied": True if row[2]==1 else False, + "address": row[3], + "coordinates": [row[5], row[4], 0], + "fuel": row[6], + "engineType": row[7], + "smartPhoneRequired": True if row[8]==1 else False, + "interior": "GOOD" if not row[9] else row[8], + "exterior": "GOOD" if not row[10] else row[9], + } + self.state[car_id] = dataset + if dataset["occupied"]: + self.in_use.add(car_id) + else: + self.in_use.discard(car_id) + + def scanfile(self, filename, stamp_str, stamp_dt, file_count, files_total): + print("File: {} ({}/{})".format(filename, file_count, files_total), end="") + jf = open(filename, "r") + doc = json.load(jf) + jf.close() + root = doc["placemarks"] + print(" ({} cars)".format(len(root)), end="") + self.parse_cars(stamp_dt, root) + if file_count {} {} ({}%, {}, {})".format( + car_type[0], + car_id, + distance, + minutes, + old_data["address"], + old_data["fuel"], + old_data["interior"], + old_data["exterior"], + old_data["occupied"], + stamp, + new_data["address"], + new_data["fuel"], + new_data["interior"], + new_data["exterior"] + )) + elif new_data["fuel"] != old_data["fuel"]: + print("{} FUEL CHANGE: {} -> {}".format(car_id, old_data["fuel"], new_data["fuel"])) + if new_data["vin"] != old_data["vin"]: + print("{} ################# VIN CHANGE!!!".format(car_id)) + self.state[car_id] = new_data + +if __name__=="__main__": + C2GAnalyse() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ac07547 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +geopy diff --git a/sql/dbschema.sql b/sql/dbschema.sql new file mode 100644 index 0000000..4e5f903 --- /dev/null +++ b/sql/dbschema.sql @@ -0,0 +1,119 @@ +PRAGMA journal_mode=WAL; +PRAGMA foreign_keys=on; + +-- VIN digits: +-- 1..3 = manufacturer code +-- 4..6 = general type +-- 7 = body form +-- 8..9 = engine type +-- 10 = steering (1=left, 2=right) +-- 11 = manufacturing facilty +CREATE TABLE "car_models" ( + "vinPrefix" TEXT PRIMARY KEY, + "model" TEXT NOT NULL, + "subtype" TEXT, + "pricePerMinute" REAL +); + +INSERT INTO "car_models" ("vinPrefix", "model", "subtype", "pricePerMinute") VALUES + ("WDD117342", "CLA", "180", 0.34), + ("WDC156912", "GLA", "180 CDI", 0.34), + ("WDC156942", "GLA", "180", 0.34), + ("WDC156943", "GLA", "200", 0.34), + ("WDD176012", "A", "180 CDI", 0.31), + ("WDD176042", "A", "180", 0.31), + ("WDD176043", "A", "200", 0.31), + ("WDD242890", "B", "ed", 0.34), + ("WDD246242", "B", "180", 0.34), + ("WME451334", "smart", "fortwo 451 coupé mhd", 0.26), + ("WME451390", "smart", "fortwo 451 coupé ed", 0.29), + ("WME451391", "smart", "fortwo 451 coupé ed", 0.29), + ("WME451392", "smart", "fortwo 451 coupé ed (Brabus)", 0.29), + ("WME453391", "smart", "fortwo 453 coupé ed", 0.29), + ("WME453342", "smart", "fortwo 453 coupé", 0.26) +; + +CREATE TABLE "cars" ( + "carId" INTEGER PRIMARY KEY AUTOINCREMENT, + "plate" TEXT NOT NULL, + "vin" TEXT, + "vinPrefix" TEXT REFERENCES "car_models" ("vinPrefix"), + "smartPhoneRequired" INTEGER, + "engineType" TEXT +); + +-- Plates can't have different VINs and vice versa +CREATE UNIQUE INDEX "idx_cars" ON "cars" ("plate", "vin"); + +CREATE VIEW "cars_detailed" AS + SELECT c.carId, c.plate, cm.model, cm.subtype, cm.pricePerMinute, c.vin, c.smartPhoneRequired, c.engineType + FROM "cars" c + LEFT JOIN "car_models" cm ON c.vinPrefix=cm.vinPrefix; + +-- TEST +CREATE VIEW "cars_detailed2" AS + SELECT c.carId, c.plate, cm.model, cm.subtype, cm.pricePerMinute, c.vin, c.smartPhoneRequired, c.engineType + FROM "cars" c + LEFT JOIN "car_models" cm ON substr(c.vin, 1, 9)=cm.vinPrefix; + +CREATE TABLE "car_state" ( + "stamp" INTEGER NOT NULL, + "carId" INTEGER NOT NULL REFERENCES "cars" ("carId"), + "occupied" INTEGER, + "address" TEXT, + "latitude" REAL, + "longitude" REAL, + "fuel" INTEGER, + "charging" INTEGER, + "interior_bad" TEXT, + "exterior_bad" TEXT +); + +-- A car can only have one state at a time +CREATE UNIQUE INDEX "idx_car_state" ON "car_state" ("stamp", "carId"); + +CREATE VIEW "car_state_human" AS + SELECT datetime(stamp, "unixepoch", "localtime") AS "datetime", * FROM "car_state" ORDER BY stamp ASC; + +CREATE VIEW "car_statechanges_count" AS + SELECT cs.carId, ca.plate, cm.model, cm.subtype, COUNT(*) AS stateChanges + FROM "car_state" cs + LEFT JOIN "cars" ca ON cs.carId=ca.carId + LEFT JOIN "car_models" cm ON ca.vinPrefix=cm.vinPrefix + GROUP BY cs.carId + ORDER BY stateChanges DESC; + +-- To be filtered by carId or plate +CREATE VIEW "car_history" AS + SELECT cs."datetime", cs.carId, c.plate, cs.occupied, cs.address, cs.latitude, cs.longitude, cs.fuel, cs.charging, c.vin, c.engineType, cs.interior_bad, cs.exterior_bad, c.smartPhoneRequired + FROM "car_state_human" cs + LEFT JOIN "cars" c ON cs.carId=c.carId + ORDER BY stamp ASC; + +CREATE TABLE "trips" ( + carId INTEGER REFERENCES "cars" ("carId"), + stamp_departure INTEGER, + stamp_arrival INTEGER, + duration_minutes REAL, + distance_km REAL, + fuel_spent INTEGER, + price REAL, + FOREIGN KEY ("carId", "stamp_departure") REFERENCES "car_state" ("carId", "stamp"), + FOREIGN KEY ("carId", "stamp_arrival") REFERENCES "car_state" ("carId", "stamp") +); + +CREATE VIEW "trips_details" AS + SELECT + cd.plate, cd.model, cd.subtype, cs1.address AS "address_departure", cs2.address AS "address_arrival", + datetime(t.stamp_departure, "unixepoch", "localtime") AS "time_departure", + datetime(t.stamp_arrival, "unixepoch", "localtime") AS "time_arrival", + t.duration_minutes, t.distance_km, t.fuel_spent, t.price, + t.stamp_departure, cs1.latitude AS "latitude_departure", cs1.longitude AS "longitude_departure", + t.stamp_arrival, cs2.latitude AS "latitude_arrival", cs2.longitude AS "longitude_arrival", + cs1.fuel AS "fuel_departure", cs2.fuel AS "fuel_arrival", + cd.vin, cd.pricePerMinute, cd.engineType + FROM trips t + LEFT JOIN car_state cs1 ON t.carId=cs1.carId AND t.stamp_departure=cs1.stamp + LEFT JOIN car_state cs2 ON t.carId=cs2.carId AND t.stamp_arrival=cs2.stamp + LEFT JOIN cars_detailed cd ON t.carId=cd.carId + ORDER BY t.stamp_departure ASC; diff --git a/sql/geocoder-fail.sql b/sql/geocoder-fail.sql new file mode 100644 index 0000000..0d3ba8e --- /dev/null +++ b/sql/geocoder-fail.sql @@ -0,0 +1,6 @@ +-- These coordinates have multiple addresses? +SELECT * FROM car_state WHERE latitude=52.50205 AND longitude=13.44281; +SELECT * FROM car_state WHERE latitude=52.50366 AND longitude=13.2961; +SELECT * FROM car_state WHERE latitude=52.51389 AND longitude=13.19664; +SELECT * FROM car_state WHERE latitude=52.52896 AND longitude=13.41283; +SELECT * FROM car_state WHERE latitude=52.52984 AND longitude=13.41373; diff --git a/sql/testquery-brandenburggate.sql b/sql/testquery-brandenburggate.sql new file mode 100644 index 0000000..1b7e179 --- /dev/null +++ b/sql/testquery-brandenburggate.sql @@ -0,0 +1,4 @@ +SELECT * +FROM car_history +WHERE latitude>=52.515652 AND longitude>=13.372373 +AND latitude<=52.516813 AND longitude<=13.378115; diff --git a/sql/testquery-history.sql b/sql/testquery-history.sql new file mode 100644 index 0000000..130276d --- /dev/null +++ b/sql/testquery-history.sql @@ -0,0 +1,3 @@ +SELECT * +FROM car_history +WHERE plate="B-GO3781"; diff --git a/testquery.sh b/testquery.sh new file mode 100644 index 0000000..cd67f15 --- /dev/null +++ b/testquery.sh @@ -0,0 +1,8 @@ +#!/bin/sh +for f in sql/testquery*.sql; do + echo "################" + echo "### $f" + echo "################" + sqlite3 -header car2go.db3 < $f + #sqlite3 -header -line car2go.db3 < $f +done