From 7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 10 Mar 2020 12:27:53 +0000 Subject: Initial commit --- datasets/shanghai_license/.gitignore | 1 + datasets/shanghai_license/README.md | 29 + .../Shanghai_license_plate_price_-_Sheet3.csv | 205 +++++++ datasets/shanghai_license/convert.py | 89 +++ datasets/shanghai_license/shanghai_license.json | 637 +++++++++++++++++++++ datasets/shanghai_license/shanghai_license.png | Bin 0 -> 21139 bytes 6 files changed, 961 insertions(+) create mode 100644 datasets/shanghai_license/.gitignore create mode 100644 datasets/shanghai_license/README.md create mode 100644 datasets/shanghai_license/Shanghai_license_plate_price_-_Sheet3.csv create mode 100644 datasets/shanghai_license/convert.py create mode 100644 datasets/shanghai_license/shanghai_license.json create mode 100644 datasets/shanghai_license/shanghai_license.png (limited to 'datasets/shanghai_license') diff --git a/datasets/shanghai_license/.gitignore b/datasets/shanghai_license/.gitignore new file mode 100644 index 0000000..a52cbb0 --- /dev/null +++ b/datasets/shanghai_license/.gitignore @@ -0,0 +1 @@ +old/ diff --git a/datasets/shanghai_license/README.md b/datasets/shanghai_license/README.md new file mode 100644 index 0000000..f4c7026 --- /dev/null +++ b/datasets/shanghai_license/README.md @@ -0,0 +1,29 @@ +# Shanghai License Plate Applicants + +Source: +[Kaggle](https://www.kaggle.com/bogof666/shanghai-car-license-plate-auction-price). +Data licensed under [CC0: Public +Domain](https://creativecommons.org/publicdomain/zero/1.0/), so we can +redistribute it as part of this repository. + +There seems to be a clear sudden growth in the number of applicants. 
+ +Note: according to [this discussion on +Kaggle](https://www.kaggle.com/bogof666/shanghai-car-license-plate-auction-price/discussion/73140), +the record for 2008-02 is missing because the license plates for January and +February were auctioned off simultaneously in January. As this represents an +uneven measurement and a missing value, we choose to split the observation for +January and February 2008 in two, dividing the amount equally between the +months. An alternative would be to introduce a missing value in 2008-02, but +since many of the algorithms we wish to evaluate are not able to handle +missing values (and any imputation method would be incorrect), we believe this +is a reasonable way to deal with this issue. + +To obtain the ``shanghai_license.json`` file from the +``Shanghai_license_plate_price_-_Sheet3.csv`` file, simply run: + +``` +$ python convert.py Shanghai_license_plate_price_-_Sheet3.csv shanghai_license.json +``` + +![Plot of shanghai_license dataset](./shanghai_license.png) diff --git a/datasets/shanghai_license/Shanghai_license_plate_price_-_Sheet3.csv b/datasets/shanghai_license/Shanghai_license_plate_price_-_Sheet3.csv new file mode 100644 index 0000000..18863d7 --- /dev/null +++ b/datasets/shanghai_license/Shanghai_license_plate_price_-_Sheet3.csv @@ -0,0 +1,205 @@ +Date,Total number of license issued,lowest price ,avg price,Total number of applicants +Jan-02,1400,13600,14735,3718 +Feb-02,1800,13100,14057,4590 +Mar-02,2000,14300,14662,5190 +Apr-02,2300,16000,16334,4806 +May-02,2350,17800,18357,4665 +Jun-02,2800,19600,20178,4502 +Jul-02,3000,19800,20904,3774 +Aug-02,3000,21000,21601,4640 +Sep-02,3200,23600,24040,4393 +Oct-02,3200,26400,27040,4661 +Nov-02,3200,30800,31721,4021 +Dec-02,3600,27800,27848,3525 +Jan-03,3000,18800,24267,9442 +Feb-03,3000,23800,25254,12030 +Mar-03,3000,28800,29551,11219 +Apr-03,3300,34100,34845,8794 +May-03,3800,35000,36903,14634 +Jun-03,5500,36100,37667,15507 +Jul-03,6000,36900,38269,11929 
+Aug-03,4500,38500,39369,9315 +Sep-03,6650,28800,38728,8532 +Oct-03,4500,32800,34842,9383 +Nov-03,5042,33100,34284,9849 +Dec-03,4776,37100,38054,10491 +Jan-04,5000,38000,39516,8663 +Feb-04,4800,39600,40053,10156 +Mar-04,4800,43000,43333,9950 +Apr-04,5500,44200,45492,8150 +May-04,6527,10800,34226,8114 +Jun-04,6233,17800,21001,19233 +Jul-04,6600,21800,23544,14464 +Aug-04,6800,25100,25991,15506 +Sep-04,6640,29300,30033,10634 +Oct-04,6600,28000,29768,9519 +Nov-04,6600,26000,27620,9188 +Dec-04,5500,29300,30282,9005 +Jan-05,5500,28500,32520,6208 +Feb-05,3800,31700,32425,8949 +Mar-05,4000,34300,34684,9117 +Apr-05,5000,36800,37355,8113 +May-05,5833,35000,35661,9673 +Jun-05,5690,37000,37479,8409 +Jul-05,6326,37900,38378,8777 +Aug-05,6829,25000,35905,7520 +Sep-05,6700,26500,28927,10972 +Oct-05,6000,25200,26385,11167 +Nov-05,5700,29800,30320,13633 +Dec-05,5700,35200,36749,8351 +Jan-06,5000,26900,31220,5907 +Feb-06,3800,34200,34887,12367 +Mar-06,4500,38500,38932,8904 +Apr-06,5000,37500,38326,7888 +May-06,4500,37700,38139,8301 +Jun-06,4500,39500,39752,8478 +Jul-06,5500,39600,39966,8966 +Aug-06,6200,39900,40459,9190 +Sep-06,6500,37000,41601,7064 +Oct-06,6500,36300,37899,11237 +Nov-06,6000,37800,38460,110234 +Dec-06,6500,39800,40518,9477 +Jan-07,6000,38500,40974,6587 +Feb-07,3500,39100,40473,5056 +Mar-07,4000,41100,41573,10168 +Apr-07,5500,43300,43623,10523 +May-07,6000,44500,44853,10273 +Jun-07,6000,47200,47711,11478 +Jul-07,5500,45200,46581,10327 +Aug-07,8000,46500,46897,12943 +Sep-07,8500,48600,49631,10561 +Oct-07,7500,50500,51000,10715 +Nov-07,7500,53800,54317,10596 +Dec-07,7500,50000,56042,10356 +Jan-08,16000,8100,23370,20539 +Mar-08,9300,31300,32169,63534 +Apr-08,9000,37300,37659,37072 +May-08,8200,34400,36047,26341 +Jun-08,7700,33900,34947,21208 +Jul-08,6800,33800,34491,16783 +Aug-08,6000,35900,36460,13451 +Sep-08,6500,29300,31788,11002 +Oct-08,5000,32600,33224,11882 +Nov-08,5500,21800,24351,10170 +Dec-08,4500,31000,31665,16801 +Jan-09,5200,28600,29399,16544 
+Feb-09,5200,33000,33394,16848 +Mar-09,6000,26600,27552,18575 +Apr-09,6500,28300,28724,17654 +May-09,7200,28500,29100,16471 +Jun-09,8000,30000,30363,17433 +Jul-09,8000,32100,32522,17220 +Aug-09,8000,36000,36231,18750 +Sep-09,8500,27200,29500,14906 +Oct-09,8000,33900,34402,22006 +Nov-09,8000,34900,35317,21902 +Dec-09,8000,36900,37593,18577 +Jan-10,8000,37800,38311,18975 +Feb-10,7500,38300,38620,18810 +Mar-10,8000,39600,39882,17704 +Apr-10,8500,41000,41637,17313 +May-10,8500,41900,42262,16324 +Jun-10,9200,39200,40380,16252 +Jul-10,9000,38400,39362,13389 +Aug-10,9000,39800,40169,16855 +Sep-10,9000,41800,42180,15198 +Oct-10,9000,43000,43271,14941 +Nov-10,8500,44900,45291,13429 +Dec-10,9000,10400,15970,11224 +Jan-11,8000,38300,38771,30675 +Feb-11,7500,44200,44627,25104 +Mar-11,8000,46200,46657,25014 +Apr-11,8000,46300,47399,22326 +May-11,9000,47400,47700,25708 +Jun-11,9000,48500,48855,22474 +Jul-11,9000,50900,51174,21852 +Aug-11,9000,51000,52228,21544 +Sep-11,9500,52200,52622,22268 +Oct-11,9000,53800,54008,19415 +Nov-11,9000,45700,47635,20050 +Dec-11,8500,51000,51437,26531 +Jan-12,8000,52800,53195,24354 +Feb-12,8000,55400,55632,23391 +Mar-12,8000,58300,58625,24897 +Apr-12,8500,61000,61626,22706 +May-12,9300,64000,64367,24230 +Jun-12,9500,55800,58227,24774 +Jul-12,9500,57700,58271,26526 +Aug-12,9500,62100,62559,21425 +Sep-12,9500,65700,66425,19114 +Oct-12,9500,65200,66708,19921 +Nov-12,9500,66400,66946,19120 +Dec-12,9500,68900,69346,18244 +Jan-13,9000,75000,75332,20857 +Feb-13,9000,83300,83571,24651 +Mar-13,9000,90800,91898,23589 +Apr-13,11000,83900,84101,26174 +May-13,9000,80700,80803,22224 +Jun-13,9000,77600,77823,21482 +Jul-13,9000,76300,76465,21811 +Aug-13,9000,74700,74939,22650 +Sep-13,9000,73400,73492,35154 +Oct-13,10000,82300,83723,28887 +Nov-13,8500,75500,75717,38220 +Dec-13,8500,76000,76093,39625 +Jan-14,8100,73500,73501,41946 +Feb-14,7400,73200,73357,45758 +Mar-14,7400,73800,73872,61853 +Apr-14,8200,74000,74113,94241 +May-14,7400,74400,74503,114121 
+Jun-14,7400,73800,73896,135677 +Jul-14,7400,74600,74680,136098 +Aug-14,7400,73600,73785,121550 +Sep-14,8300,73800,73875,122219 +Oct-14,7400,74000,74075,105532 +Nov-14,7400,73500,73633,95595 +Dec-14,7447,73600,73687,96972 +Jan-15,7990,74000,74216,98203 +Feb-15,7653,76500,76618,103224 +Mar-15,7406,74600,74830,132690 +Apr-15,8288,80600,80759,152298 +May-15,7482,79000,79099,156007 +Jun-15,7441,80000,80020,172205 +Jul-15,7531,83100,83171,166302 +Aug-15,7454,82600,82642,166939 +Sep-15,8727,82100,82172,165765 +Oct-15,7763,85300,85424,170995 +Nov-15,7514,84600,84703,169159 +Dec-15,7698,84500,84572,179133 +Jan-16,9409,82200,82352,187533 +Feb-16,8363,83200,83244,196470 +Mar-16,8310,83100,83148,221109 +Apr-16,11829,85100,85127,256897 +May-16,11598,85000,85058,277889 +Jun-16,11546,84400,84483,275438 +Jul-16,11475,87200,87235,240750 +Aug-16,11549,86900,86946,251188 +Sep-16,12889,86500,86523,229544 +Oct-16,11621,88300,88359,213212 +Nov-16,11549,88600,88665,215424 +Dec-16,12261,88300,88412,219882 +Jan-17,12215,87600,87685,232101 +Feb-17,10157,88200,88240,251717 +Mar-17,10356,87800,87916,262010 +Apr-17,12196,89800,89850,252273 +May-17,10316,90100,90209,270197 +Jun-17,10312,89400,89532,244349 +Jul-17,10325,92200,92250,269189 +Aug-17,10558,91600,91629,256083 +Sep-17,12413,91300,91415,250566 +Oct-17,11388,93500,93540,244868 +Nov-17,11002,93100,93130,226911 +Dec-17,12147,92800,92848,228148 +Jan-18,12183,87900,87936,226316 +Feb-18,11098,87600,87660,220831 +Mar-18,9855,88100,88176,217056 +Apr-18,11916,86900,87089,204980 +May-18,10216,89000,89018,198627 +Jun-18,10775,87800,87900,209672 +Jul-18,10395,88300,88380,202337 +Aug-18,10402,88300,88365,192755 +Sep-18,12712,87300,87410,189142 +Oct-18,10728,88000,88070,181861 +Nov-18,11766,87300,87374,177355 +Dec-18,12850,87400,87508,165442 +Jan-19,12832,89500,89565,168614 diff --git a/datasets/shanghai_license/convert.py b/datasets/shanghai_license/convert.py new file mode 100644 index 0000000..b29395a --- /dev/null +++ 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Dataset conversion script

Reads the Shanghai license plate CSV file, keeps the first column (the date)
and the last column (the number of applicants) of every data row, and writes
them to a JSON file holding a time axis and a single series.

Author: Gertjan van den Burg

"""

import json
import argparse

# Three-letter month abbreviations as they appear in the first CSV column,
# mapped to their month number.
_MONTHS = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


def reformat_time(mmmyy):
    """Convert a date label from ``MMM-YY`` to ``%Y-%m``.

    The two-digit year is offset by 2000, so ``"Jan-02"`` becomes
    ``"2002-01"``. Leading and trailing whitespace is ignored.

    Parameters
    ----------
    mmmyy : str
        Date label such as ``"Jan-02"``.

    Returns
    -------
    str
        The date formatted as ``YYYY-MM``.

    Raises
    ------
    ValueError
        If the label is not of the form ``MMM-YY`` or if the month
        abbreviation is not one of the twelve known ones.
    """
    try:
        mmm, yy = mmmyy.strip().split("-")
        year = int(yy) + 2000
    except ValueError as exc:
        raise ValueError(
            "Expected a date of the form MMM-YY, got: %r" % (mmmyy,)
        ) from exc

    # An explicit check gives a readable error; formatting a missing month
    # (None) with %02i would otherwise fail with an unrelated TypeError.
    if mmm not in _MONTHS:
        raise ValueError(
            "Unknown month abbreviation %r in date %r" % (mmm, mmmyy)
        )
    return "%i-%02i" % (year, _MONTHS[mmm])


def parse_args():
    """Parse the command line: the input CSV path and the output JSON path."""
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="File to convert")
    parser.add_argument("output_file", help="File to write to")
    return parser.parse_args()


def _split_jan08(time, values):
    """Split the January 2008 observation into January and February.

    Both lists are modified in place: the value at "2008-01" is halved and a
    "2008-02" entry with the same halved value is inserted right after it.
    See the readme for the reasoning. Raises ValueError (from ``list.index``)
    if "2008-01" is not present.
    """
    jan08idx = time.index("2008-01")
    # NOTE(review): true division yields a float (10269.5 for the shipped
    # CSV) although the series below is labelled "int". Left unchanged so
    # that the script keeps reproducing the committed JSON file -- confirm
    # whether the label or the split should be adjusted.
    values[jan08idx] /= 2
    time.insert(jan08idx + 1, "2008-02")
    values.insert(jan08idx + 1, values[jan08idx])


def main():
    """Convert the CSV file given on the command line to the JSON format."""
    # Imported here so that the helpers above stay importable when the
    # third-party clevercsv package is not installed.
    import clevercsv

    args = parse_args()

    with open(args.input_file, "r", newline="", encoding="ascii") as fp:
        reader = clevercsv.reader(
            fp, delimiter=",", quotechar="", escapechar=""
        )
        rows = list(reader)

    # The first row is the column header.
    rows.pop(0)

    time = [reformat_time(r[0]) for r in rows]
    values = [int(r[-1]) for r in rows]

    # Manually split Jan-08 into two, see readme for details.
    _split_jan08(time, values)

    name = "shanghai_license"
    longname = "Shanghai License"
    time_fmt = "%Y-%m"
    series = [{"label": "No. of Applicants", "type": "int", "raw": values}]

    data = {
        "name": name,
        "longname": longname,
        "n_obs": len(time),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": time_fmt,
            "index": list(range(len(time))),
            "raw": time,
        },
        "series": series,
    }

    with open(args.output_file, "w") as fp:
        json.dump(data, fp, indent="\t")


if __name__ == "__main__":
    main()
186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204 + ], + "raw": [ + "2002-01", + "2002-02", + "2002-03", + "2002-04", + "2002-05", + "2002-06", + "2002-07", + "2002-08", + "2002-09", + "2002-10", + "2002-11", + "2002-12", + "2003-01", + "2003-02", + "2003-03", + "2003-04", + "2003-05", + "2003-06", + "2003-07", + "2003-08", + "2003-09", + "2003-10", + "2003-11", + "2003-12", + "2004-01", + "2004-02", + "2004-03", + "2004-04", + "2004-05", + "2004-06", + "2004-07", + "2004-08", + "2004-09", + "2004-10", + "2004-11", + "2004-12", + "2005-01", + "2005-02", + "2005-03", + "2005-04", + "2005-05", + "2005-06", + "2005-07", + "2005-08", + "2005-09", + "2005-10", + "2005-11", + "2005-12", + "2006-01", + "2006-02", + "2006-03", + "2006-04", + "2006-05", + "2006-06", + "2006-07", + "2006-08", + "2006-09", + "2006-10", + "2006-11", + "2006-12", + "2007-01", + "2007-02", + "2007-03", + "2007-04", + "2007-05", + "2007-06", + "2007-07", + "2007-08", + "2007-09", + "2007-10", + "2007-11", + "2007-12", + "2008-01", + "2008-02", + "2008-03", + "2008-04", + "2008-05", + "2008-06", + "2008-07", + "2008-08", + "2008-09", + "2008-10", + "2008-11", + "2008-12", + "2009-01", + "2009-02", + "2009-03", + "2009-04", + "2009-05", + "2009-06", + "2009-07", + "2009-08", + "2009-09", + "2009-10", + "2009-11", + "2009-12", + "2010-01", + "2010-02", + "2010-03", + "2010-04", + "2010-05", + "2010-06", + "2010-07", + "2010-08", + "2010-09", + "2010-10", + "2010-11", + "2010-12", + "2011-01", + "2011-02", + "2011-03", + "2011-04", + "2011-05", + "2011-06", + "2011-07", + "2011-08", + "2011-09", + "2011-10", + "2011-11", + "2011-12", + "2012-01", + "2012-02", + "2012-03", + "2012-04", + "2012-05", + "2012-06", + "2012-07", + "2012-08", + "2012-09", + "2012-10", + "2012-11", + "2012-12", + "2013-01", + "2013-02", + "2013-03", + "2013-04", + "2013-05", + "2013-06", + "2013-07", + "2013-08", + "2013-09", + "2013-10", + 
"2013-11", + "2013-12", + "2014-01", + "2014-02", + "2014-03", + "2014-04", + "2014-05", + "2014-06", + "2014-07", + "2014-08", + "2014-09", + "2014-10", + "2014-11", + "2014-12", + "2015-01", + "2015-02", + "2015-03", + "2015-04", + "2015-05", + "2015-06", + "2015-07", + "2015-08", + "2015-09", + "2015-10", + "2015-11", + "2015-12", + "2016-01", + "2016-02", + "2016-03", + "2016-04", + "2016-05", + "2016-06", + "2016-07", + "2016-08", + "2016-09", + "2016-10", + "2016-11", + "2016-12", + "2017-01", + "2017-02", + "2017-03", + "2017-04", + "2017-05", + "2017-06", + "2017-07", + "2017-08", + "2017-09", + "2017-10", + "2017-11", + "2017-12", + "2018-01", + "2018-02", + "2018-03", + "2018-04", + "2018-05", + "2018-06", + "2018-07", + "2018-08", + "2018-09", + "2018-10", + "2018-11", + "2018-12", + "2019-01" + ] + }, + "series": [ + { + "label": "No. of Applicants", + "type": "int", + "raw": [ + 3718, + 4590, + 5190, + 4806, + 4665, + 4502, + 3774, + 4640, + 4393, + 4661, + 4021, + 3525, + 9442, + 12030, + 11219, + 8794, + 14634, + 15507, + 11929, + 9315, + 8532, + 9383, + 9849, + 10491, + 8663, + 10156, + 9950, + 8150, + 8114, + 19233, + 14464, + 15506, + 10634, + 9519, + 9188, + 9005, + 6208, + 8949, + 9117, + 8113, + 9673, + 8409, + 8777, + 7520, + 10972, + 11167, + 13633, + 8351, + 5907, + 12367, + 8904, + 7888, + 8301, + 8478, + 8966, + 9190, + 7064, + 11237, + 110234, + 9477, + 6587, + 5056, + 10168, + 10523, + 10273, + 11478, + 10327, + 12943, + 10561, + 10715, + 10596, + 10356, + 10269.5, + 10269.5, + 63534, + 37072, + 26341, + 21208, + 16783, + 13451, + 11002, + 11882, + 10170, + 16801, + 16544, + 16848, + 18575, + 17654, + 16471, + 17433, + 17220, + 18750, + 14906, + 22006, + 21902, + 18577, + 18975, + 18810, + 17704, + 17313, + 16324, + 16252, + 13389, + 16855, + 15198, + 14941, + 13429, + 11224, + 30675, + 25104, + 25014, + 22326, + 25708, + 22474, + 21852, + 21544, + 22268, + 19415, + 20050, + 26531, + 24354, + 23391, + 24897, + 22706, + 24230, + 24774, + 
26526, + 21425, + 19114, + 19921, + 19120, + 18244, + 20857, + 24651, + 23589, + 26174, + 22224, + 21482, + 21811, + 22650, + 35154, + 28887, + 38220, + 39625, + 41946, + 45758, + 61853, + 94241, + 114121, + 135677, + 136098, + 121550, + 122219, + 105532, + 95595, + 96972, + 98203, + 103224, + 132690, + 152298, + 156007, + 172205, + 166302, + 166939, + 165765, + 170995, + 169159, + 179133, + 187533, + 196470, + 221109, + 256897, + 277889, + 275438, + 240750, + 251188, + 229544, + 213212, + 215424, + 219882, + 232101, + 251717, + 262010, + 252273, + 270197, + 244349, + 269189, + 256083, + 250566, + 244868, + 226911, + 228148, + 226316, + 220831, + 217056, + 204980, + 198627, + 209672, + 202337, + 192755, + 189142, + 181861, + 177355, + 165442, + 168614 + ] + } + ] +} \ No newline at end of file diff --git a/datasets/shanghai_license/shanghai_license.png b/datasets/shanghai_license/shanghai_license.png new file mode 100644 index 0000000..574fc24 Binary files /dev/null and b/datasets/shanghai_license/shanghai_license.png differ -- cgit v1.2.3