Revision | d93d2d4c5a8dc3b9544bbf0cbaf4ebdf93db0457
---|---
Size | 12,495 bytes
Date | 2010-01-12 08:02:47
Author | lorenzo
Log Message | Some minor modifications, mainly taking into account the need to use …

#!/usr/bin/env python
import scipy as s
import numpy as n
import pylab as p
import sys
import string
def duplicate_and_mix_array(my_2d_arr):
    """Cast the time-dependent edge list into a boot-count-like shape.

    Each input row (t, a, b) yields two consecutive output rows
    (t, a) and (t, b): the even output row carries column 1, the odd
    output row carries column 2, and both repeat the timestamp.

    Parameters: my_2d_arr -- integer array of shape (N, 3).
    Returns: int64 array of shape (2N, 2).
    """
    n_rows = len(my_2d_arr)
    # Allocate once with the final dtype instead of zeros()+astype().
    new_arr = n.zeros((2 * n_rows, 2), dtype="int64")
    # Strided slices replace the explicit even/odd index arrays.
    new_arr[0::2, 0] = my_2d_arr[:, 0]
    new_arr[1::2, 0] = my_2d_arr[:, 0]
    new_arr[0::2, 1] = my_2d_arr[:, 1]
    new_arr[1::2, 1] = my_2d_arr[:, 2]
    return new_arr
def remove_equal_lines(A):
    """Collapse duplicate rows of the 2-d array A.

    Returns [B, C] where B stacks the unique rows of A and C holds how
    many times each row occurred (C[i] is the count of B[i]).
    """
    counts = {}
    for r in A:
        t = tuple(r)
        counts[t] = counts.get(t, 0) + 1
    # list() is required on Python 3, where keys()/values() are views
    # that numpy cannot turn into a 2-d array directly.  Iterating the
    # dict twice without modifying it yields matching orders.
    B = n.array(list(counts.keys()))    # the unique rows of A
    C = n.array(list(counts.values()))  # the counts of the unique rows
    return [B, C]
# def select_entrance_single_tag(entrance_data):
# len_arr=len(entrance_data)
# entrance_list=s.zeros(len_arr).astype("int")-1 #array initialization
# ip_set=set((168624164,3232235522))
# for i in xrange(len_arr):
# my_set=set(entrance_data[i])
# if len(ip_set & my_set)>0:
# entrance_list[i]=1
# return entrance_list
def visit_duration_single_tag(my_data_arr, visitor_id, timeslice):
    """Compute the visit duration of a single visitor id.

    my_data_arr -- 2-d integer array; column 0 holds timestamps
                   (assumed sorted ascending -- TODO confirm, the old
                   commented-out check suggests ordering problems were
                   seen), column 1 holds packed visitor ids.
    visitor_id  -- packed id: tag in the bits above 16, boot count in
                   the low 16 bits.
    timeslice   -- sampling interval; added to last-first so the result
                   matches Alain's definition of a visit duration.

    Returns [duration, visitor_id, t_first, t_last, tag, boot].
    """
    find_visitor = n.where(my_data_arr[:, 1] == visitor_id)[0]
    # Unpack the composite id: upper bits = tag, lower 16 bits = boot.
    tag_recover = visitor_id >> 16
    boot_recover = visitor_id & 0xFFFF
    time_visit = my_data_arr[find_visitor, 0]
    visit_duration = time_visit[-1] - time_visit[0] + timeslice
    return [visit_duration, visitor_id, time_visit[0], time_visit[-1],
            tag_recover, boot_recover]
def visit_duration_many_tags(my_data_arr, timeslice, min_dur, max_dur):
    """Per-visitor visit statistics for one block of edge data.

    Returns [visit_arr, visit_flux, visit_arr_extended, my_black_overall]:
      visit_arr          -- duration of each visit, one per visitor id
      visit_flux         -- 1-element array with the number of visits
      visit_arr_extended -- per visit: tag, boot, duration, t_first,
                            t_last, raw visitor id
      my_black_overall   -- ids whose duration falls outside
                            [min_dur, max_dur] (from generate_blacklist)
    """
    # n.unique replaces the long-removed scipy/numpy unique1d.
    visitor_id_list = n.unique(my_data_arr[:, 1])
    n_visits = len(visitor_id_list)
    # Allocate with the target dtype up front instead of arange dummies.
    visit_arr = n.zeros(n_visits, dtype="int64")
    visit_arr_extended = n.zeros((n_visits, 6), dtype="int64")
    my_black_overall = n.zeros(0)
    for i in range(n_visits):
        visitor_id = visitor_id_list[i]
        visit_data = visit_duration_single_tag(my_data_arr, visitor_id,
                                               timeslice)
        visit_arr[i] = visit_data[0]
        # Column layout: tag, boot, duration, first time, last time, id.
        visit_arr_extended[i, 0] = visit_data[4]
        visit_arr_extended[i, 1] = visit_data[5]
        visit_arr_extended[i, 2] = visit_data[0]
        visit_arr_extended[i, 3] = visit_data[2]
        visit_arr_extended[i, 4] = visit_data[3]
        visit_arr_extended[i, 5] = visit_data[1]
        my_black = generate_blacklist(visitor_id, visit_data[0],
                                      min_dur, max_dur)
        my_black_overall = n.hstack((my_black_overall, my_black))
    # Same value/shape as the original arange(1)-then-fill idiom.
    visit_flux = n.array([n_visits])
    return [visit_arr, visit_flux, visit_arr_extended, my_black_overall]
#here boot_time is simply the dummy argument of a function, but I deal
#only with contact protocols
def generate_iteration_grid(boot_time_non_unique,
                            number_intervals, interval_duration,
                            t_ini, t_end):
    """Build the int64 grid of time-interval boundaries.

    Exactly one of number_intervals / interval_duration must be
    positive: a positive number_intervals gives an evenly spaced
    linspace grid, a positive interval_duration gives an arange grid.
    When t_ini (resp. t_end) is positive it overrides the first (resp.
    last) unique boot time found in the data.

    Raises ValueError when both or neither sizing argument is positive
    (the original printed an error and then crashed on an unbound
    time_grid at the return).
    """
    time_unique = n.unique(boot_time_non_unique)
    time_ini = t_ini if t_ini > 0 else time_unique[0]
    print("time_ini is,  %s" % time_ini)
    if number_intervals <= 0 and interval_duration <= 0:
        raise ValueError("Error in calling the function: one of "
                         "number_intervals/interval_duration must be > 0")
    if number_intervals > 0 and interval_duration > 0:
        raise ValueError("Error in calling the function: "
                         "number_intervals and interval_duration are "
                         "mutually exclusive")
    time_end = t_end if t_end > 0 else time_unique[-1]
    if number_intervals > 0:
        time_grid = n.linspace(time_ini, time_end, number_intervals)
    else:
        time_grid = n.arange(time_ini, time_end, interval_duration)
    return time_grid.astype("int64")
def generate_blacklist(visitor_id, visit_duration, min_dur, max_dur):
    """Return visitor_id when its visit duration falls outside
    [min_dur, max_dur], otherwise an empty int64 array -- either way
    the result can be hstack-ed onto a running blacklist."""
    if visit_duration < min_dur or visit_duration > max_dur:
        return visitor_id
    return n.zeros(0, dtype="int64")
#Parameters to be used in the filtering of the visits
min_dur = 60 * 5             # minimum accepted visit duration: 5 min
max_dur = 60 * 180           # maximum accepted visit duration: 180 min
min_visits_in_period = 30    # min visits per period to keep its statistics
timeslice = 20               # sampling interval; needed to fix the durations

#I should read the disambiguated network representation as an edgelist
#into data_arr: whitespace-separated integer columns, one edge per line.
# 'with' closes the file even on error; line.split() replaces the
# Python-3-removed string.split(line).
with open(sys.argv[1]) as f:
    data_arr = [[int(tok) for tok in line.split()] for line in f]
data_arr = n.array(data_arr, dtype="int64")
print("I finished reading the file")

boot_time_non_unique = data_arr[:, 0]
# Hard-coded observation window: 3 days starting at epoch 1246233600
# (2009-06-29 00:00, presumably UTC -- TODO confirm).
t_ini = 1246233600
t_end = t_ini + 86400 * 3 + 1
number_intervals = -12       # <= 0: the grid is sized by duration instead
interval_duration = 86400    # i.e. 1 day  #604800 i.e. one week in seconds
time_grid = generate_iteration_grid(boot_time_non_unique,
                                    (number_intervals + 1),
                                    interval_duration, t_ini, t_end)
print("len(time_grid) is,  %s" % len(time_grid))
print("time_grid[0] and time_grid[-1] are,  %s %s"
      % (time_grid[0], time_grid[-1]))
n.savetxt("time_grid.dat", time_grid, fmt="%d")

# -1 marks "no data for this period"; filled in by the main loop below.
file_list = -n.ones(len(time_grid))
blacklist_file_list = -n.ones(len(time_grid))
raw_mean = -n.ones(len(time_grid))
refined_mean = -n.ones(len(time_grid))
number_visit_raw = -n.ones(len(time_grid))
number_visit_refined = -n.ones(len(time_grid))
#print("file_list is, %s" % file_list)
# Main loop: process each interval [time_grid[i], time_grid[i+1]) in
# turn, compute per-visitor visit durations for the edges recorded in
# that interval, and write the per-period result files.
for i in xrange(len(time_grid)-1):
    print "i+1 is, ", i+1
    # Indices of the rows whose boot time falls inside interval i.
    time_sel=s.where((boot_time_non_unique>= time_grid[i]) & \
    (boot_time_non_unique< time_grid[i+1]))[0]
    data_arr_sel=data_arr[time_sel,:]
#    if (i==20):
#        p.save("data_arr_sel_ini.dat",data_arr_sel,fmt="%d")
#        p.save("time_sel.dat",time_sel+1,fmt="%d")
    #Now I need to change the shape of data_arr_sel
    # (from one (t, a, b) edge per row to two (t, node) rows).
    data_arr_sel=duplicate_and_mix_array(data_arr_sel)
#    if (i==20):
#        p.save("data_arr_sel_ini_reshaped.dat",data_arr_sel,fmt="%d")
    #The following commented part has been changed into a comment since it
    #was a mistake: getting rid of those lines was creating a mess...
    #I am not even sure that the function remove_equal_lines exactly
    #works here...
    #Now I also need to get rid of repeated entries corresponding to
    #self interactions
#    data_arr_sel=remove_equal_lines(data_arr_sel)[0]
#    if (i==20):
#        p.save("data_arr_sel_remove_lines.dat", data_arr_sel, fmt="%d")
    museum_open=1 #flag telling me whether I collected data or not in a
    #certain time period
    if (len(data_arr_sel)>0): #do not do anything if nobody comes to the museum!
#        print "data_arr_sel is, ", data_arr_sel
        all_duration_and_all_counts=visit_duration_many_tags(data_arr_sel,timeslice,min_dur, max_dur)
        # Per-visit durations for this period.
        filename="all_visit_duration_%01d"%(i+1)
        filename=filename+"_.dat"
        all_durations=all_duration_and_all_counts[0]
        # Keep only the visits inside the accepted [min_dur, max_dur] window.
        all_durations_selected=all_durations[s.where((all_durations>=min_dur) \
        & (all_durations<=max_dur) )]
        number_visit_refined[i]=len(all_durations_selected)
        raw_mean[i]=s.mean(all_durations)
        if (len(all_durations_selected)>0): #careful! In certain days
            #nothing may survive!
            refined_mean[i]=s.mean(all_durations_selected)
        #print "len(all_durations) is, ", len(all_durations)
        n.savetxt(filename,all_durations , fmt='%d')
        #Now save extended visit info like Alain does
        filename="all_visit_duration_extended_%01d"%(i+1)
        filename=filename+"_.dat"
        all_durations_extended=all_duration_and_all_counts[2]
        # Sort the extended records by tag id (column 0) before saving.
        my_ord=s.argsort(all_durations_extended[:,0])
        all_durations_extended=all_durations_extended[my_ord,:]
        n.savetxt(filename,all_durations_extended , fmt='%d')
#        #Now do some polishing of the extended visit information
#        my_ord=s.where(all_durations_extended[:,2]>0)[0]
#        all_durations_extended=all_durations_extended[my_ord]
        # Total (unfiltered) number of visits in this period.
        filename="all_counts_%01d"%(i+1)
        filename=filename+"_.dat"
        all_counts=all_duration_and_all_counts[1]
        number_visit_raw[i]=all_counts
        print "all_counts is, ", all_counts
        n.savetxt(filename,all_counts , fmt='%d')
        # Visitor ids whose duration fell outside the accepted window.
        filename="blacklist_%01d"%(i+1)
        filename=filename+"_.dat"
        blacklisted_tags=all_duration_and_all_counts[3]
        if (len(blacklisted_tags)>0):
            n.savetxt(filename,blacklisted_tags , fmt='%d')
            blacklist_file_list[i]=i+1
        file_list[i]=i+1
    else:
        museum_open=-2
#        if (museum_open>0):
#            file_list[i]=i+1
#        else:
        # NOTE(review): indentation was lost in extraction; these prints
        # are taken to belong to the empty-period branch -- confirm
        # against the original file's whitespace.
        print "i+1 for an empty period is, ", i+1
        print "time_grid[i] and time_grid[i+1] are, ", time_grid[i], \
        time_grid[i+1]
# ---- Post-processing: keep only the periods that produced data and
# ---- write the summary files.
file_list_ini = n.copy(file_list)
sel = n.where(file_list > 0)
file_list = file_list[sel]
#print("file_list is, %s" % file_list)
n.savetxt("file_list.dat", file_list, fmt='%d')
sel = n.where(blacklist_file_list > 0)
blacklist_file_list = blacklist_file_list[sel]
n.savetxt("blacklist_file_list.dat", blacklist_file_list, fmt='%d')
ini_raw_mean = n.copy(raw_mean)
# NOTE(review): 'sel' here is still the blacklist selection from just
# above, so raw_mean is filtered by the blacklist periods rather than
# by the file_list periods.  Preserved as-is, but it looks suspicious
# -- confirm it is intended.
raw_mean = raw_mean[sel]
n.savetxt("raw_mean.dat", raw_mean, fmt='%d')
n.savetxt("raw_mean_in_minutes.dat", raw_mean / 60.)
#Now this is tricky: there can be certain days when I have no visits after
#filtering!
sel = n.where(refined_mean > 0)
ini_refined_mean = n.copy(refined_mean)
refined_mean = refined_mean[sel]
n.savetxt("refined_mean.dat", refined_mean, fmt='%d')
n.savetxt("refined_mean_in_minutes.dat", refined_mean / 60.)
n.savetxt("refined_file_list.dat", file_list_ini[sel], fmt='%d')
# fig = p.figure()
# axes = fig.gca()
# axes.plot(file_list_ini[sel],refined_mean/60.,"ro",\
#     label="filter on min/max",linewidth=2.)
# axes.plot(file_list_ini[sel], ini_raw_mean[sel]/60.,"bx",linewidth=2.,\
#     label="raw data (same days)" )
# axes.legend()
# p.ylim((5,90))
# p.xlabel('Time (days)')
# p.ylabel('Mean Visit Duration')
# p.savefig("comparison_visit_duration_same_days.pdf")
# p.clf()
# fig = p.figure()
# axes = fig.gca()
# axes.plot(file_list_ini[sel],\
#     abs(refined_mean/60.-ini_raw_mean[sel]/60.)/(ini_raw_mean[sel]/60.)*100,"ro"\
#     ,linewidth=2.)
# p.xlabel('Time (days)')
# p.ylabel('Relative % difference filtered vs raw')
# p.savefig("relative_difference_same_days.pdf")
# p.clf()
# fig = p.figure()
# axes = fig.gca()
# axes.plot(file_list_ini[sel],refined_mean/60.,"ro",\
#     label="filter on min/max",linewidth=2.)
# axes.plot(file_list_ini[sel],refined_mean/60.,"r" \
#     ,linewidth=2.)
# # axes.plot(file_list, raw_mean/60.,"b",linewidth=2.,\
# #     label="raw data" )
# axes.legend()
# p.xlabel('Time (days)')
# p.ylabel('Mean Visit Duration')
# p.savefig("filtered_visit_duration.pdf")
# p.clf()
#Now choose to take statistics only on days when there is a minimum of
#recorded visits
sel = n.where(number_visit_refined >= min_visits_in_period)
# fig = p.figure()
# axes = fig.gca()
# axes.plot(file_list_ini[sel],ini_refined_mean[sel]/60.,"ro",\
#     label="filter on min/max",linewidth=2.)
# axes.plot(file_list_ini[sel], ini_raw_mean[sel]/60.,"bx",linewidth=2.,\
#     label="raw data (same days)" )
# p.title("Select on minimum number of visits per day")
# axes.legend()
# p.ylim((5,90))
# p.xlabel('Time (days)')
# p.ylabel('Mean Visit Duration')
# p.savefig("comparison_visit_duration_same_days_and_minimum_number_visits.pdf")
# p.clf()
print("So far so good")