Initial commit.

This commit is contained in:
Markus Birth 2018-01-14 03:23:42 +01:00
commit 4505fba099
Signed by: mbirth
GPG Key ID: A9928D7A098C3A9A
14 changed files with 623 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
/data/*
!/data/.gitignore
*.pyc
/car2go.db3
/car2go.db3-shm
/car2go.db3-wal

83
README.md Normal file
View File

@ -0,0 +1,83 @@
Car Sharing Analyser
====================
This is a collection of scripts and tools to gather, prepare and analyse data about a
German car sharing company.
Gathering
---------
The script `getData.sh` collects a JSON dump from the webpage every 30 seconds and stores
that into the `data/` folder. A week's worth of data is about 3.3 GiB.
Edit the file first to configure your desired city. Then let it run in a tmux session.
Preparing
---------
`init_db.sh` will create a SQLite database file according to `sql/dbschema.sql`.
Make sure you have Python 3 installed. Install required Python packages by running:
sudo -H pip3 install -r requirements.txt
(Or use virtualenv, pipenv, venv, etc.)
Then run `import.py` to import all the JSON dumps into the database. If you continue the data
collection, you can run `import.py` later to import the new data. Successfully imported JSON
dumps can be deleted, of course.
Importing a week's worth of data takes about 5 minutes and results in a 14.5 MiB
SQLite3 database.
Analysing
---------
Run `calc_trips.py` to analyse car's state changes and calculate possible(!) trips from it.
The trips data is written back into the database. (Trips shorter than 70 seconds are filtered.
See notes below.) A week's worth of trip data increases the database by about 4 MiB.
You can also use this script as a starting point for your own analysing scripts.
(Pull requests are welcome.)
For working with the database itself, I recommend
[DB Browser for SQLite](http://sqlitebrowser.org/).
### Example Queries
To see the history of a specific car, you can run e.g. (`XYZ` = number plate):
SELECT * FROM car_history WHERE plate="XYZ";
To see cars in a specific area, you can run:
SELECT *
FROM car_history
WHERE latitude>=52.515652 AND longitude>=13.372373
AND latitude<=52.516813 AND longitude<=13.378115;
Also check out the views in the database.
Notes
=====
* The `distance_km` is beeline from starting point to end point. If somebody runs errands and parks
in the exact same spot, the distance is (almost) 0.
* You can't distinguish between these cases with distances of (almost) zero and no petrol spent:
* a car that made a short trip and parked in the same spot,
* a car that has been taken out of service for a while,
* a car that has been reserved (possible for up to 30 minutes) but the reservation expired
* "Trips" over several days are most probably cars that have been taken out of service.
* A car that has been "reserved" (for up to 30 minutes) disappears from the list of cars. The
reservation time is included in the trip's `duration_minutes`.
* The calculated prices don't factor in additional fees (airport, drop-off), the time a car was
"reserved", and are also calculated for cars taken offline, i.e. where no money was paid by
the customer.
* Smaller negative `fuel_spent` values are probably because the car was parked on a slope.
* The id for a car is it's number plate. Theoretically, a `plate` could be put on another
vehicle (with a different `vin`). But this is very unlikely.

75
calc_trips.py Normal file
View File

@ -0,0 +1,75 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import geopy.distance
import math
import sqlite3
ANSI_UP_DEL = u"\u001b[F\u001b[K"
class C2GCalcTrips:
def __init__(self):
self.dbfile = "car2go.db3"
self.conn = sqlite3.connect(self.dbfile)
self.c = self.conn.cursor()
self.c.execute("PRAGMA foreign_keys=on;")
self.run()
def run(self):
all_cars = self.load_cars()
print("Clearing table.")
self.c.execute("DELETE FROM trips;")
print("Going to analyse {} cars.".format(len(all_cars)))
inserter = self.conn.cursor()
num_trips = 0
cars_done = 0
for carId in all_cars:
is_occupied = False
self.c.execute("SELECT stamp, occupied, fuel, latitude, longitude FROM car_state WHERE carId=? ORDER BY stamp ASC;", (carId,))
for row in self.c:
if row[1] == 1:
is_occupied = True
occu_state = row
elif row[1] == 0 and is_occupied:
is_occupied = False
seconds = row[0] - occu_state[0]
minutes = seconds / 60
if seconds <= 70:
# probably a glitch
continue
price = math.ceil(minutes) * all_cars[carId]
old_spot = (occu_state[3], occu_state[4])
new_spot = (row[3], row[4])
distance = geopy.distance.distance(old_spot, new_spot).kilometers
dataset = (
carId,
occu_state[0],
row[0],
minutes,
distance,
(occu_state[2] - row[2]),
price
)
#print(repr(dataset))
inserter.execute("INSERT INTO trips (carId, stamp_departure, stamp_arrival, duration_minutes, distance_km, fuel_spent, price) VALUES (?,?,?,?,?,?,?);", dataset)
num_trips += 1
cars_done += 1
print("{}/{} cars done. {} trips found.".format(cars_done, len(all_cars), num_trips))
if cars_done < len(all_cars):
print(ANSI_UP_DEL, end="")
print("")
self.conn.commit()
self.c.execute("VACUUM;")
self.conn.close()
def load_cars(self):
print("Loading known cars from database.")
self.c.execute("SELECT c.carId, c.plate, cm.pricePerMinute FROM cars c LEFT JOIN car_models cm ON c.vinPrefix=cm.vinPrefix;")
result = {}
for row in self.c.fetchall():
result[row[0]] = row[2]
return result
if __name__=="__main__":
C2GCalcTrips()

0
data/.gitignore vendored Normal file
View File

23
getData.sh Normal file
View File

@ -0,0 +1,23 @@
#!/bin/sh
# LOC is one of: berlin, frankfurt, hamburg, muenchen, rheinland, stuttgart
LOC="berlin"
OAUTH_CONSUMER_KEY="car2gowebsite"
FORMAT="json"
OUTPUT_DIR="data"
# URL found via browser's dev tools' "Networking" view
URL="https://www.car2go.com/api/v2.1/vehicles?loc=${LOC}&oauth_consumer_key=${OAUTH_CONSUMER_KEY}&format=${FORMAT}"
echo "URL: $URL"
while [ 1 ]; do
STAMP=`date +%Y-%m-%d_%H%M%S`
OUTPUT="$OUTPUT_DIR/$STAMP.json"
wget -q "$URL" -O "$OUTPUT"
if [ $? -eq 0 ]; then
echo -n "."
else
echo -n "!"
rm "$OUTPUT"
fi
sleep 30
done

162
import.py Normal file
View File

@ -0,0 +1,162 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import glob
import json
import math
import os
import re
import sqlite3
ANSI_UP_DEL = u"\u001b[F\u001b[K"
class C2GImport:
def __init__(self):
self.state = {}
self.in_use = set()
self.carids = {}
self.dbfile = "car2go.db3"
self.conn = sqlite3.connect(self.dbfile)
self.conn.row_factory = sqlite3.Row
self.c = self.conn.cursor()
self.c.execute("PRAGMA foreign_keys=on;")
self.run()
def run(self):
self.load_cars()
self.load_data()
self.c.execute("SELECT MAX(stamp) FROM car_state;");
max_date_row = self.c.fetchone()
if max_date_row[0] is not None:
max_date = max_date_row[0]
max_date_dt = datetime.datetime.fromtimestamp(max_date, tz=datetime.timezone(datetime.timedelta(hours=1)))
print("Skipping already collected data until {}.".format(repr(max_date_dt)))
else:
max_date_dt = datetime.datetime.min.replace(tzinfo=datetime.timezone(datetime.timedelta(hours=1)))
ctr = 0
file_list = sorted(glob.glob(os.path.join("data/", "*.json")))
file_count = len(file_list)
for f in file_list:
stamp = re.search(r'(\d{4}-\d\d-\d\d_\d{6})', f)
stamp_str = stamp.group(1)
stamp_dt = datetime.datetime.strptime(stamp_str + "+0100", "%Y-%m-%d_%H%M%S%z")
if stamp_dt <= max_date_dt:
continue
ctr += 1
self.scanfile(f, stamp_str, stamp_dt, ctr, file_count)
if ctr%50==0:
self.conn.commit()
self.conn.commit()
self.c.execute("VACUUM;")
self.conn.close()
def load_cars(self):
print("Loading known cars from database.")
self.c.execute("SELECT carId, plate FROM cars;")
for row in self.c.fetchall():
self.carids[row[1]] = row[0]
def load_data(self):
print("Load known states from database.")
self.c.execute("SELECT c.plate, c.vin, cs.occupied, cs.address, cs.latitude, cs.longitude, cs.fuel, c.engineType, c.smartPhoneRequired, cs.interior_bad, cs.exterior_bad FROM car_state cs LEFT JOIN cars c ON cs.carId=c.carId;")
for row in self.c:
car_id = row[0]
dataset = {
"name": car_id,
"vin": row[1],
"occupied": True if row[2]==1 else False,
"address": row[3],
"coordinates": [row[5], row[4], 0],
"fuel": row[6],
"engineType": row[7],
"smartPhoneRequired": True if row[8]==1 else False,
"interior": "GOOD" if not row[9] else row[8],
"exterior": "GOOD" if not row[10] else row[9],
}
self.state[car_id] = dataset
if dataset["occupied"]:
self.in_use.add(car_id)
else:
self.in_use.discard(car_id)
def scanfile(self, filename, stamp_str, stamp_dt, file_count, files_total):
print("File: {} ({}/{})".format(filename, file_count, files_total), end="")
jf = open(filename, "r")
doc = json.load(jf)
jf.close()
root = doc["placemarks"]
print(" ({} cars)".format(len(root)), end="")
self.parse_cars(stamp_dt, root)
if file_count<files_total:
print(ANSI_UP_DEL, end="")
def parse_cars(self, stamp, placemarks):
free_cars = set()
changed_cars = set()
# Walk all unoccupied cars
for car_data in placemarks:
car_id = car_data["name"]
car_data["occupied"] = False
if self.has_changed(car_data):
changed_cars.add(car_id)
free_cars.add(car_id)
self.state[car_id] = car_data
if car_id in self.in_use:
self.in_use.remove(car_id)
# Walk all known cars
for car_id in self.state:
if not car_id in free_cars and not car_id in self.in_use:
# CAR GOT OCCUPIED
self.state[car_id]["occupied"] = True
self.in_use.add(car_id)
changed_cars.add(car_id)
for car_id in changed_cars:
# Commit changes to database
data = self.state[car_id]
if not car_id in self.carids:
self.c.execute(
"INSERT INTO cars (plate, vin, vinPrefix, smartPhoneRequired, engineType) VALUES (?,?,?,?,?)", (
car_id,
data["vin"],
data["vin"][0:9],
1 if data["smartPhoneRequired"] else 0,
data["engineType"]
))
self.carids[car_id] = self.c.lastrowid
self.c.execute(
"INSERT INTO car_state (stamp, carId, occupied, address, latitude, longitude, fuel, charging, interior_bad, exterior_bad) VALUES (?,?,?,?,?,?,?,?,?,?)", (
math.floor(stamp.timestamp()),
self.carids[car_id],
1 if data["occupied"] else 0,
data["address"],
data["coordinates"][1],
data["coordinates"][0],
data["fuel"],
None if not "charging" in data else 1 if data["charging"] else 0,
None if data["interior"] == "GOOD" else data["interior"],
None if data["exterior"] == "GOOD" else data["exterior"]
))
print(".", end="")
print(" {}".format(len(changed_cars)))
def has_changed(self, new_data):
car_id = new_data["name"]
if not car_id in self.state:
return True
old_data = self.state[car_id]
for k in new_data:
if old_data[k] != new_data[k]:
#print("{} changed {}".format(car_id, k))
return True
return False
if __name__=="__main__":
C2GImport()

3
init_db.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/sh
#rm car2go.db3
sqlite3 car2go.db3 < sql/dbschema.sql

130
old/analyse.py Normal file
View File

@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import geopy.distance
import json
import math
import os
import glob
import re
import sys
class C2GAnalyse:
def __init__(self):
self.state = {}
self.in_use = []
self.total_minutes = 0.0
self.total_money = 0.0
self.total_distance = 0.0
self.run()
def find_car_type(self, vin):
# 1..3 = manufacturer code
# 4..6 = general type
# 7 = body form
# 8..9 = engine type
# 10 = steering (1=left, 2=right)
# 11 = manufacturing facilty
seg = vin[0:9]
typelist = {
"WDD117342": ["CLA 180", 0.34],
"WDC156912": ["GLA 180 CDI", 0.34],
"WDC156942": ["GLA 180", 0.34],
"WDC156943": ["GLA 200", 0.34],
"WDD176012": ["A 180 CDI", 0.31],
"WDD176042": ["A 180", 0.31],
"WDD176043": ["A 200", 0.31],
"WDD246242": ["B 180", 0.34],
"WME451334": ["smart fortwo 451", 0.26],
"WME451390": ["smart fortwo 451 ed", 0.29],
"WME451391": ["smart fortwo 451 ed", 0.29],
"WME453342": ["smart fortwo 453", 0.26],
}
if not seg in typelist:
print("Missing vehicle type for {} (VIN {})!".format(seg, vin))
return typelist[seg]
def run(self):
for f in sorted(glob.glob(os.path.join("../data/", "*.json"))):
self.scanfile(f)
print("{} cars seen in total.".format(len(self.state)))
print("Total kilometres driven: {}".format(self.total_distance))
print("Total minutes driven: {}".format(self.total_minutes))
print("Total money made: {}".format(self.total_money))
def scanfile(self, filename):
stamp = re.search(r'(\d{4}-\d\d-\d\d_\d{6})', filename)
stamp_str = stamp.group(1)
stamp_dt = datetime.datetime.strptime(stamp_str, "%Y-%m-%d_%H%M%S")
print("File: {} ({}, Totals: {:.3f}km, {:.1f}mins, {:.2f}€)".format(filename, stamp_str, self.total_distance, self.total_minutes, self.total_money), end="")
jf = open(filename, "r")
doc = json.load(jf)
jf.close()
root = doc["placemarks"]
print(" ({} cars available)".format(len(root)))
self.parse_cars(stamp_dt, root)
def parse_cars(self, stamp, placemarks):
not_in_use = []
for pm in placemarks:
self.update_car(stamp, pm)
not_in_use.append(pm["name"])
for car_id in self.state:
if not car_id in not_in_use:
if not car_id in self.in_use:
print("{} OCCUPIED!".format(car_id))
self.state[car_id]["occupied"] = stamp
self.in_use.append(car_id)
else:
if car_id in self.in_use:
#print("{} RETURNED!".format(car_id))
if "occupied" in self.state[car_id]:
del self.state[car_id]["occupied"]
self.in_use.remove(car_id)
def update_car(self, stamp, new_data):
car_id = new_data["name"]
if car_id in self.state:
old_data = self.state[car_id]
car_type = self.find_car_type(new_data["vin"])
if new_data["address"] != old_data["address"]:
## CAR MOVED
old_spot = (old_data["coordinates"][1], old_data["coordinates"][0])
new_spot = (new_data["coordinates"][1], new_data["coordinates"][0])
distance = geopy.distance.distance(old_spot, new_spot).kilometers
if not "occupied" in old_data:
old_data["occupied"] = "unknown"
minutes = -1
else:
minutes = (stamp - old_data["occupied"]).total_seconds() / 60
self.total_minutes += minutes
self.total_money += math.ceil(minutes) * car_type[1]
self.total_distance += distance
print("{} {} moved for {:.3f}km/{:.1f}mins: {} ({}%, {}, {}) {} -> {} {} ({}%, {}, {})".format(
car_type[0],
car_id,
distance,
minutes,
old_data["address"],
old_data["fuel"],
old_data["interior"],
old_data["exterior"],
old_data["occupied"],
stamp,
new_data["address"],
new_data["fuel"],
new_data["interior"],
new_data["exterior"]
))
elif new_data["fuel"] != old_data["fuel"]:
print("{} FUEL CHANGE: {} -> {}".format(car_id, old_data["fuel"], new_data["fuel"]))
if new_data["vin"] != old_data["vin"]:
print("{} ################# VIN CHANGE!!!".format(car_id))
self.state[car_id] = new_data
if __name__=="__main__":
C2GAnalyse()

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
geopy

119
sql/dbschema.sql Normal file
View File

@ -0,0 +1,119 @@
PRAGMA journal_mode=WAL;
PRAGMA foreign_keys=on;
-- VIN digits:
-- 1..3 = manufacturer code
-- 4..6 = general type
-- 7 = body form
-- 8..9 = engine type
-- 10 = steering (1=left, 2=right)
-- 11 = manufacturing facilty
CREATE TABLE "car_models" (
"vinPrefix" TEXT PRIMARY KEY,
"model" TEXT NOT NULL,
"subtype" TEXT,
"pricePerMinute" REAL
);
INSERT INTO "car_models" ("vinPrefix", "model", "subtype", "pricePerMinute") VALUES
("WDD117342", "CLA", "180", 0.34),
("WDC156912", "GLA", "180 CDI", 0.34),
("WDC156942", "GLA", "180", 0.34),
("WDC156943", "GLA", "200", 0.34),
("WDD176012", "A", "180 CDI", 0.31),
("WDD176042", "A", "180", 0.31),
("WDD176043", "A", "200", 0.31),
("WDD242890", "B", "ed", 0.34),
("WDD246242", "B", "180", 0.34),
("WME451334", "smart", "fortwo 451 coupé mhd", 0.26),
("WME451390", "smart", "fortwo 451 coupé ed", 0.29),
("WME451391", "smart", "fortwo 451 coupé ed", 0.29),
("WME451392", "smart", "fortwo 451 coupé ed (Brabus)", 0.29),
("WME453391", "smart", "fortwo 453 coupé ed", 0.29),
("WME453342", "smart", "fortwo 453 coupé", 0.26)
;
CREATE TABLE "cars" (
"carId" INTEGER PRIMARY KEY AUTOINCREMENT,
"plate" TEXT NOT NULL,
"vin" TEXT,
"vinPrefix" TEXT REFERENCES "car_models" ("vinPrefix"),
"smartPhoneRequired" INTEGER,
"engineType" TEXT
);
-- Plates can't have different VINs and vice versa
CREATE UNIQUE INDEX "idx_cars" ON "cars" ("plate", "vin");
CREATE VIEW "cars_detailed" AS
SELECT c.carId, c.plate, cm.model, cm.subtype, cm.pricePerMinute, c.vin, c.smartPhoneRequired, c.engineType
FROM "cars" c
LEFT JOIN "car_models" cm ON c.vinPrefix=cm.vinPrefix;
-- TEST
CREATE VIEW "cars_detailed2" AS
SELECT c.carId, c.plate, cm.model, cm.subtype, cm.pricePerMinute, c.vin, c.smartPhoneRequired, c.engineType
FROM "cars" c
LEFT JOIN "car_models" cm ON substr(c.vin, 1, 9)=cm.vinPrefix;
CREATE TABLE "car_state" (
"stamp" INTEGER NOT NULL,
"carId" INTEGER NOT NULL REFERENCES "cars" ("carId"),
"occupied" INTEGER,
"address" TEXT,
"latitude" REAL,
"longitude" REAL,
"fuel" INTEGER,
"charging" INTEGER,
"interior_bad" TEXT,
"exterior_bad" TEXT
);
-- A car can only have one state at a time
CREATE UNIQUE INDEX "idx_car_state" ON "car_state" ("stamp", "carId");
CREATE VIEW "car_state_human" AS
SELECT datetime(stamp, "unixepoch", "localtime") AS "datetime", * FROM "car_state" ORDER BY stamp ASC;
CREATE VIEW "car_statechanges_count" AS
SELECT cs.carId, ca.plate, cm.model, cm.subtype, COUNT(*) AS stateChanges
FROM "car_state" cs
LEFT JOIN "cars" ca ON cs.carId=ca.carId
LEFT JOIN "car_models" cm ON ca.vinPrefix=cm.vinPrefix
GROUP BY cs.carId
ORDER BY stateChanges DESC;
-- To be filtered by carId or plate
CREATE VIEW "car_history" AS
SELECT cs."datetime", cs.carId, c.plate, cs.occupied, cs.address, cs.latitude, cs.longitude, cs.fuel, cs.charging, c.vin, c.engineType, cs.interior_bad, cs.exterior_bad, c.smartPhoneRequired
FROM "car_state_human" cs
LEFT JOIN "cars" c ON cs.carId=c.carId
ORDER BY stamp ASC;
CREATE TABLE "trips" (
carId INTEGER REFERENCES "cars" ("carId"),
stamp_departure INTEGER,
stamp_arrival INTEGER,
duration_minutes REAL,
distance_km REAL,
fuel_spent INTEGER,
price REAL,
FOREIGN KEY ("carId", "stamp_departure") REFERENCES "car_state" ("carId", "stamp"),
FOREIGN KEY ("carId", "stamp_arrival") REFERENCES "car_state" ("carId", "stamp")
);
CREATE VIEW "trips_details" AS
SELECT
cd.plate, cd.model, cd.subtype, cs1.address AS "address_departure", cs2.address AS "address_arrival",
datetime(t.stamp_departure, "unixepoch", "localtime") AS "time_departure",
datetime(t.stamp_arrival, "unixepoch", "localtime") AS "time_arrival",
t.duration_minutes, t.distance_km, t.fuel_spent, t.price,
t.stamp_departure, cs1.latitude AS "latitude_departure", cs1.longitude AS "longitude_departure",
t.stamp_arrival, cs2.latitude AS "latitude_arrival", cs2.longitude AS "longitude_arrival",
cs1.fuel AS "fuel_departure", cs2.fuel AS "fuel_arrival",
cd.vin, cd.pricePerMinute, cd.engineType
FROM trips t
LEFT JOIN car_state cs1 ON t.carId=cs1.carId AND t.stamp_departure=cs1.stamp
LEFT JOIN car_state cs2 ON t.carId=cs2.carId AND t.stamp_arrival=cs2.stamp
LEFT JOIN cars_detailed cd ON t.carId=cd.carId
ORDER BY t.stamp_departure ASC;

6
sql/geocoder-fail.sql Normal file
View File

@ -0,0 +1,6 @@
-- These coordinates have multiple addresses?
SELECT * FROM car_state WHERE latitude=52.50205 AND longitude=13.44281;
SELECT * FROM car_state WHERE latitude=52.50366 AND longitude=13.2961;
SELECT * FROM car_state WHERE latitude=52.51389 AND longitude=13.19664;
SELECT * FROM car_state WHERE latitude=52.52896 AND longitude=13.41283;
SELECT * FROM car_state WHERE latitude=52.52984 AND longitude=13.41373;

View File

@ -0,0 +1,4 @@
SELECT *
FROM car_history
WHERE latitude>=52.515652 AND longitude>=13.372373
AND latitude<=52.516813 AND longitude<=13.378115;

View File

@ -0,0 +1,3 @@
SELECT *
FROM car_history
WHERE plate="B-GO3781";

8
testquery.sh Normal file
View File

@ -0,0 +1,8 @@
#!/bin/sh
for f in sql/testquery*.sql; do
echo "################"
echo "### $f"
echo "################"
sqlite3 -header car2go.db3 < $f
#sqlite3 -header -line car2go.db3 < $f
done