# Source code for tacoma.data_io

"""
This module provides routines to load temporal networks from,
and write them to, the `taco` file format.
"""
from __future__ import print_function

import os
import json
import gzip  # for sociopatterns data
import csv

import wget
import shutil

import tacoma as tc
from tacoma import _get_raw_temporal_network


def mkdirp_customdir(directory='~/.tacoma/'):
    """Create `directory` including missing parents (like `mkdir -p`).

    Parameters
    ----------
    directory : :obj:`str`, optional
        Directory to create. A leading ``~`` is expanded to the user's
        home directory and the path is made absolute.
        default : "~/.tacoma/"

    Raises
    ------
    OSError
        If the directory cannot be created, e.g. because the path
        already exists but is not a directory.
    """
    directory = os.path.abspath(os.path.expanduser(directory))

    try:
        os.makedirs(directory)
    except OSError:
        # Create first and check afterwards: testing for existence before
        # creating would fail if another process creates the directory
        # in between. An already existing directory is fine, anything
        # else (permissions, a regular file in the way) is re-raised.
        if not os.path.isdir(directory):
            raise
def reset_web_directory():
    """Reset the internal system directory `~/.tacoma/web/`.

    The directory is removed together with everything it contains.
    If it does not exist there is nothing to reset and the function
    returns without raising.
    """
    directory = os.path.abspath(os.path.expanduser('~/.tacoma/web/'))

    # Nothing has been written there yet, so the directory is
    # already in its "reset" state.
    if not os.path.exists(directory):
        return

    shutil.rmtree(directory)
def write_json_taco(temporal_network, fp):
    """Writes a temporal network to a .taco-file (which is actually in json format).

    Parameters
    ----------
    temporal_network : :mod:`edge_changes`, :mod:`edge_lists`, :mod:`edge_changes_with_histograms`, or :mod:`edge_lists_with_histograms`
        An instance of a temporal network.
    fp : file-like or :obj:`str`
        write to this file. A string is treated as a path (``~`` is
        expanded) and opened for writing. The file object is closed
        before returning, also when it was supplied by the caller.

    Raises
    ------
    ValueError
        If the network is neither of type `edge_changes` nor `edge_lists`
        after being passed through `_get_raw_temporal_network`.
    """

    temporal_network = _get_raw_temporal_network(temporal_network)

    # Attribute names to serialize, in the order they appear in the file.
    if type(temporal_network) == tc.edge_changes:
        type_name = 'edge_changes'
        fields = (
            't', 't0', 'tmax', 'N',
            'edges_initial', 'edges_in', 'edges_out',
            'int_to_node', 'notes', 'time_unit',
        )
    elif type(temporal_network) == tc.edge_lists:
        type_name = 'edge_lists'
        fields = (
            't', 'tmax', 'N',
            'edges',
            'int_to_node', 'notes', 'time_unit',
        )
    else:
        raise ValueError('Unknown temporal network format: ' +
                         str(type(temporal_network)))

    this_data = {'type': type_name}
    for field in fields:
        this_data[field] = getattr(temporal_network, field)

    if isinstance(fp, str):
        fp = open(os.path.abspath(os.path.expanduser(fp)), 'w')

    # Close in `finally` so that a file opened above is not leaked when
    # serialization fails (e.g. on a value json cannot encode).
    try:
        json.dump(this_data, fp, separators=(',', ':'))
    finally:
        fp.close()
def load_json_taco(fp):
    """Loads a temporal network from a .taco-file (which is actually in json format).

    Parameters
    ----------
    fp : file-like or :obj:`str`
        read from this file. A string is treated as a path (``~`` is
        expanded); anything else is handed to ``json.load`` directly.

    Returns
    -------
    temporal network
        type as given in the .taco-file

    Raises
    ------
    ValueError
        If the ``type`` entry of the file is neither ``'edge_changes'``
        nor ``'edge_lists'``.
    """

    if isinstance(fp, str):
        path = os.path.abspath(os.path.expanduser(fp))
        with open(path, 'r') as handle:
            this_data = json.load(handle)
    else:
        this_data = json.load(fp)

    kind = this_data['type']

    # Pick the container and the entries that are copied over verbatim.
    if kind == 'edge_changes':
        temporal_network = tc.edge_changes()
        verbatim_keys = ('t', 't0', 'tmax', 'N',
                         'edges_initial', 'edges_in', 'edges_out')
    elif kind == 'edge_lists':
        temporal_network = tc.edge_lists()
        verbatim_keys = ('t', 'tmax', 'N', 'edges')
    else:
        raise ValueError(
            'file is corrupted, unknown temporal network format: ' + kind)

    for key in verbatim_keys:
        setattr(temporal_network, key, this_data[key])

    # json object keys are always strings, so the integer node ids
    # have to be restored explicitly.
    temporal_network.int_to_node = {
        int(node_id): label
        for node_id, label in this_data['int_to_node'].items()
    }
    temporal_network.notes = this_data['notes']
    temporal_network.time_unit = this_data['time_unit']

    return temporal_network
def read_json_taco(fp):
    """Loads a temporal network from a .taco-file (which is actually in json format).

    This is a convenience alias that simply forwards to
    :func:`load_json_taco`, so that both the 'read' and the 'load'
    spelling work.

    Parameters
    ----------
    fp : file-like or :obj:`str`
        read from this file

    Returns
    -------
    temporal_network : `edge_lists` or `edge_changes`
        type as given in the .taco-file
    """

    temporal_network = load_json_taco(fp)
    return temporal_network
def download_and_convert_sociopatterns_hypertext_2009(
        url="http://www.sociopatterns.org/files/datasets/003/ht09_contact_list.dat.gz",
        filename="~/.tacoma/ht09.taco",
):
    """Download the SocioPatterns 'Hypertext 2009 dynamic contact network' data,
    extract it and save it as taco. This data is actually binned in intervals
    of `[t-20s, t]`.

    Parameters
    ----------
    url : :obj:`str`, optional
        The url from which the tsv-data should be retrieved
    filename : :obj:`str`, optional
        this is the path where the taco will be saved to.
        default : "~/.tacoma/ht09.taco"

    Returns
    -------
    edge_lists : :mod:`edge_lists`
        The temporal network of the 'Hypertext 2009 dynamic contact network'.

    Raises
    ------
    ValueError
        If the downloaded file does not contain any contact.

    Notes
    -----

    The gzipped download is stored next to `filename` and removed again
    before the function returns, also if the conversion fails.

    If you use this data, please cite

    ::

        L. Isella et al.,  What's in a crowd? Analysis of face-to-face behavioral networks,
        Journal of Theoretical Biology 271, 166 (2011).
    """

    # get directory name for download
    filename = os.path.abspath(os.path.expanduser(filename))
    directory, _ = os.path.split(filename)
    mkdirp_customdir(directory)

    # download; use the path reported by wget instead of a hard-coded
    # name, because the name depends on `url` and wget picks a different
    # one if a file of that name already exists in `directory`
    gzip_file = wget.download(url, out=directory)

    try:
        # mappings of nodes to integers
        node_to_int = {}
        int_to_node = {}

        # previous time stamp (this is done to detect changes in the tsv)
        t_old = None

        # list of edge lists and their time points
        edges = []
        time = []

        with gzip.open(gzip_file, mode='rt') as f:
            reader = csv.reader(f, delimiter='\t')

            for row in reader:
                # this is to account for the interval choice [t-20s, t]
                t = float(int(row[0]) - 20)

                # if the time changed, we save the new time and
                # prepare to save new edges
                if t_old != t:
                    # a gap of more than one bin means nobody was in
                    # contact right after t_old: add an empty edge list
                    if (t_old is not None) and (t - t_old > 20):
                        edges.append([])
                        time.append(t_old + 20)

                    edges.append([])
                    time.append(t)

                # get the edge
                i = int(row[1])
                j = int(row[2])

                # map the nodes to consecutive integers
                for node in (i, j):
                    if node not in node_to_int:
                        this_int = len(node_to_int)
                        node_to_int[node] = this_int
                        int_to_node[this_int] = str(node)

                # save the edge
                edges[-1].append(tuple(sorted([
                    node_to_int[i],
                    node_to_int[j],
                ])))

                t_old = t

        if not time:
            raise ValueError(
                'The file downloaded from ' + str(url) + ' does not contain any contact.')

        # get a new `edge_lists` instance
        el = tc.edge_lists()
        el.N = len(node_to_int)
        el.tmax = time[-1] + 20.0
        el.edges = edges
        el.t = time
        el.time_unit = 's'
        el.notes = (
            " This data is binned. In this data, t0 = 0.0 corresponds to"
            " 8am on Jun 29th 2009 (UNIX time 1246255200). For more info,"
            " please visit"
            " http://www.sociopatterns.org/datasets/hypertext-2009-dynamic-contact-network/ ."
            " If you use this data, please cite L. Isella et al.,"
            " What's in a crowd? Analysis of face-to-face behavioral networks,"
            " Journal of Theoretical Biology 271, 166 (2011). "
        )
        el.int_to_node = int_to_node

        # verifying that this is a valid temporal network
        tc.verify(el)

        # save this edge_lists instance
        with open(filename, 'w') as f:
            write_json_taco(el, f)
    finally:
        # remove the downloaded gzipped file
        os.remove(gzip_file)

    return el
def download_and_convert_sociopatterns_high_school_2013(
        url="http://www.sociopatterns.org/wp-content/uploads/2015/07/High-School_data_2013.csv.gz",
        filename="~/.tacoma/hs13.taco",
):
    """Download the SocioPatterns 'High school 2013 dynamic contact network' data,
    extract it and save it as taco. This data is actually binned in intervals
    of `[t-20s, t]`.

    Parameters
    ----------
    url : :obj:`str`, optional
        The url from which the data should be retrieved
    filename : :obj:`str`, optional
        this is the path where the taco will be saved to.
        default : "~/.tacoma/hs13.taco"

    Returns
    -------
    edge_lists : :mod:`edge_lists`
        The temporal network of the 'High school 2013 dynamic contact network'.

    Raises
    ------
    ValueError
        If the downloaded file does not contain any contact.

    Notes
    -----

    The gzipped download is stored next to `filename` and removed again
    before the function returns, also if the conversion fails.

    If you use this data, please cite

    ::

        [HS13] R. Mastrandrea, J. Fournet, A. Barrat,
        Contact patterns in a high school: a comparison between data collected
        using wearable sensors, contact diaries and friendship surveys.
        PLoS ONE 10(9): e0136497 (2015)
    """

    # get directory name for download
    filename = os.path.abspath(os.path.expanduser(filename))
    directory, _ = os.path.split(filename)
    mkdirp_customdir(directory)

    # download; use the path reported by wget instead of a hard-coded
    # name, because the name depends on `url` and wget picks a different
    # one if a file of that name already exists in `directory`
    gzip_file = wget.download(url, out=directory)

    try:
        # mappings of nodes to integers
        node_to_int = {}
        int_to_node = {}

        # previous time stamp (this is done to detect changes in the data)
        t_old = None

        # start of the first bin in the original time stamps; all
        # times are shifted such that the network starts at t = 0
        t0 = None

        # list of edge lists and their time points
        edges = []
        time = []

        with gzip.open(gzip_file, mode='rt') as f:
            reader = csv.reader(f, delimiter=' ')

            for row in reader:
                raw_time = int(row[0])

                if t0 is None:
                    t0 = raw_time - 20

                # this is to account for the interval choice [t-20s, t]
                t = float(raw_time - 20 - t0)

                # if the time changed, we save the new time and
                # prepare to save new edges
                if t_old != t:
                    # a gap of more than one bin means nobody was in
                    # contact right after t_old: add an empty edge list
                    if (t_old is not None) and (t - t_old > 20):
                        edges.append([])
                        time.append(t_old + 20)

                    edges.append([])
                    time.append(t)

                # get the edge
                i = int(row[1])
                j = int(row[2])

                # map the nodes to consecutive integers
                for node in (i, j):
                    if node not in node_to_int:
                        this_int = len(node_to_int)
                        node_to_int[node] = this_int
                        int_to_node[this_int] = str(node)

                # save the edge
                edges[-1].append(tuple(sorted([
                    node_to_int[i],
                    node_to_int[j],
                ])))

                t_old = t

        if not time:
            raise ValueError(
                'The file downloaded from ' + str(url) + ' does not contain any contact.')

        # get a new `edge_lists` instance
        el = tc.edge_lists()
        el.N = len(node_to_int)
        el.tmax = time[-1] + 20.0
        el.edges = edges
        el.t = time
        el.time_unit = 's'
        el.notes = (
            " This data is binned. In this data, t0 = 0.0 corresponds to"
            " UNIX time "
        ) + str(t0) + (
            ". For more info, please visit"
            " http://www.sociopatterns.org/datasets/high-school-contact-and-friendship-networks/ ."
            " If you use this data, please cite"
            " R. Mastrandrea, J. Fournet, A. Barrat,"
            " Contact patterns in a high school: a comparison between data"
            " collected using wearable sensors, contact diaries and"
            " friendship surveys. PLoS ONE 10(9): e0136497 (2015) "
        )
        el.int_to_node = int_to_node

        # verifying that this is a valid temporal network
        tc.verify(el)

        # save this edge_lists instance
        with open(filename, 'w') as f:
            write_json_taco(el, f)
    finally:
        # remove the downloaded gzipped file
        os.remove(gzip_file)

    return el
def write_fwP_args(args, filename):
    """Dump Flockwork-P arguments to a json-file.

    Parameters
    ----------
    args : json-serializable object
        The arguments to be saved.
    filename : :obj:`str`
        Write to this file (``~`` is expanded).
    """

    target = os.path.expanduser(filename)
    target = os.path.abspath(target)

    with open(target, 'w') as out:
        json.dump(args, out)
def load_fwP_args(filename):
    """Load Flockwork-P arguments from a json-file.

    Parameters
    ----------
    filename : :obj:`str`
        Read from this file (``~`` is expanded).

    Returns
    -------
    args
        The content of the json-file as decoded by ``json.load``.
    """

    source = os.path.expanduser(filename)
    source = os.path.abspath(source)

    with open(source, 'r') as stream:
        return json.load(stream)
def load_sociopatterns_hypertext_2009(filename="~/.tacoma/ht09.taco"):
    """Retrieve an :mod:`edge_lists` instance of the conference data set
    'Hypertext 2009 dynamic contact network' (from the SocioPatterns project).

    The taco-file has to exist already, e.g. because
    :func:`tacoma.data_io.download_and_convert_sociopatterns_hypertext_2009`
    was called before.

    Parameters
    ----------
    filename : :obj:`str`, optional
        this is the path where the taco was saved to.
        default : "~/.tacoma/ht09.taco"

    Returns
    -------
    edge_lists : :mod:`edge_lists`
        The temporal network of the 'Hypertext 2009 dynamic contact network'.

    Raises
    ------
    ValueError
        If `filename` does not exist.

    Notes
    -----

    If you use this data, please cite

    ::

        L. Isella et al.,  What's in a crowd? Analysis of face-to-face behavioral networks,
        Journal of Theoretical Biology 271, 166 (2011).
    """

    path = os.path.abspath(os.path.expanduser(filename))

    if os.path.exists(path):
        return load_json_taco(path)

    raise ValueError(
        "File " + path +
        " does not exist. Have you called `tacoma.download_and_convert_sociopatterns_hypertext_2009()` before?")
def load_sociopatterns_high_school_2013(filename="~/.tacoma/hs13.taco"):
    """Retrieve an :mod:`edge_lists` instance of the data set
    'High school 2013 dynamic contact network' (from the SocioPatterns project).

    The taco-file has to exist already, e.g. because
    :func:`tacoma.data_io.download_and_convert_sociopatterns_high_school_2013`
    was called before.

    Parameters
    ----------
    filename : :obj:`str`, optional
        this is the path where the taco was saved to.
        default : "~/.tacoma/hs13.taco"

    Returns
    -------
    edge_lists : :mod:`edge_lists`
        The temporal network of the 'High school 2013 dynamic contact network'.

    Raises
    ------
    ValueError
        If `filename` does not exist.

    Notes
    -----

    If you use this data, please cite

    ::

        R. Mastrandrea, J. Fournet, A. Barrat,
        Contact patterns in a high school: a comparison between data collected
        using wearable sensors, contact diaries and friendship surveys.
        PLoS ONE 10(9): e0136497 (2015)
    """

    path = os.path.abspath(os.path.expanduser(filename))

    if os.path.exists(path):
        return load_json_taco(path)

    raise ValueError(
        "File " + path +
        " does not exist. Have you called tacoma.download_and_convert_sociopatterns_high_school_2013() before?")
def write_edge_trajectory_coordinates(temporal_network, filename, filter_for_duration=0.0):
    """Write the coordinates of the edge activation periods to a json-file
    such that each entry corresponds to a line to be drawn.

    The file contains a dictionary with the entries ``xlim`` (start and
    end time), ``ylim`` (zero and the number of edge trajectories) and
    ``data``, a list of ``(edge index, start time, end time)`` entries.

    Parameters
    ----------
    temporal_network : :mod:`edge_changes`, :mod:`edge_lists`, :mod:`edge_changes_with_histograms`, or :mod:`edge_lists_with_histograms`
        An instance of a temporal network.
    filename : :obj:`str`
        Write to this file.
    filter_for_duration : :obj:`float`, optional
        Only activation periods that last strictly longer than this
        value are written. default : 0.0
    """

    traj = tc.get_edge_trajectories(temporal_network)

    # The first entry of `t` is used as the start time; `t0` is only the
    # fallback when `t` is missing or empty.
    # NOTE(review): for `edge_changes` with a non-empty `t` this picks the
    # first change time rather than `t0` -- confirm that this is intended.
    try:
        t0 = temporal_network.t[0]
    except (AttributeError, IndexError, TypeError):
        t0 = temporal_network.t0

    tmax = temporal_network.tmax

    coords = []
    for i_edge, entry in enumerate(traj):
        for time_pair in entry.time_pairs:
            t_i = time_pair[0]
            t_f = time_pair[1]
            if t_f - t_i > filter_for_duration:
                coords.append((i_edge, t_i, t_f))

    data = {
        'xlim': (t0, tmax),
        'ylim': (0, len(traj)),
        'data': coords,
    }

    filename = os.path.abspath(os.path.expanduser(filename))
    with open(filename, 'w') as f:
        json.dump(data, f)
def load_json_dict(fn):
    """Load a dictionary from a JSON-file.

    Parameters
    ----------
    fn : :obj:`str`
        Read from this file. As in the other routines of this module,
        ``~`` is expanded to the user's home directory.

    Returns
    -------
    this_dict
        The content of the file as decoded by ``json.load``.
    """

    # `open` also accepts integer file descriptors; those are passed
    # through untouched since they cannot be expanded.
    if not isinstance(fn, int):
        fn = os.path.abspath(os.path.expanduser(fn))

    with open(fn, 'r') as f:
        this_dict = json.load(f)

    return this_dict
if __name__ == "__main__":

    # Manual smoke test: download and convert the Hypertext 2009 data set
    # and print a short summary of the resulting network.
    el = download_and_convert_sociopatterns_hypertext_2009()

    print(el.N)
    print(el.tmax)

    # only show the first ten entries of the long lists
    for attribute in ('t', 'edges'):
        print(getattr(el, attribute)[:10])

    print(el.notes)
    print(el.time_unit)