Source code for eurybia.data.data_loader

"""
Data loader module
"""
import json
import os
from urllib.request import urlretrieve

import pandas as pd


_GITHUB_DATA_URL = "https://github.com/MAIF/eurybia/raw/master/eurybia/data/"


def _ensure_local_file(directory, file_name):
    """Return the path of ``file_name`` inside ``directory``, downloading it if absent."""
    local_path = os.path.join(directory, file_name)
    if not os.path.isfile(local_path):
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that later calls
        # would mistake for a complete cached copy.
        partial_path = local_path + ".part"
        try:
            urlretrieve(_GITHUB_DATA_URL + file_name, filename=partial_path)
            os.replace(partial_path, local_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return local_path


def data_loading(dataset):
    """
    data_loading allows Eurybia user to try the library with small but clear datasets.
    Titanic, house_prices or us_car_accident data.

    The files are looked up in the directory of this module; any file that is
    missing there is first downloaded from the Eurybia GitHub repository.

    Example
    ----------
    >>> from eurybia.data.data_loader import data_loading
    >>> house_df, house_dict = data_loading('house_prices')

    Parameters
    ----------
    dataset : String
        Dataset's name to return.
         - 'titanic'
         - 'house_prices'
         - 'us_car_accident'

    Returns
    -------
    data : pandas.DataFrame
        Dataset required
    dict : (Dictionary, Optional)
        Column labels dictionary associated to the dataset. Only returned
        (as the second element of a tuple) for 'house_prices'; the other
        datasets return the DataFrame alone.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    # Reject unknown names before touching the file system or the network.
    if dataset not in ("house_prices", "titanic", "us_car_accident"):
        raise ValueError("Dataset not found. Check the docstring for available values")

    current_path = os.path.dirname(os.path.abspath(__file__))

    if dataset == "house_prices":
        # Each file is checked on its own: a missing labels file is fetched
        # even when the CSV is already present.
        data_house_prices_path = _ensure_local_file(current_path, "house_prices_dataset.csv")
        dict_house_prices_path = _ensure_local_file(current_path, "house_prices_labels.json")
        data = pd.read_csv(data_house_prices_path, header=0, index_col=0, engine="python")
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(dict_house_prices_path, encoding="utf-8") as labels_file:
            dic = json.load(labels_file)
        return data, dic

    if dataset == "titanic":
        data_titanic_path = _ensure_local_file(current_path, "titanicdata.csv")
        return pd.read_csv(data_titanic_path, header=0, index_col=0, engine="python")

    # Only 'us_car_accident' remains. Unlike the other two datasets, this file
    # is read without using its first column as the index.
    data_us_car_path = _ensure_local_file(current_path, "US_Accidents_extract.csv")
    return pd.read_csv(data_us_car_path, engine="python")