File Info

Revision: d93d2d4c5a8dc3b9544bbf0cbaf4ebdf93db0457
Size: 12,495 bytes
Date: 2010-01-12 08:02:47
Author: lorenzo
Log Message

Some minor modifications, mainly taking into account the need to use
weights=NA in the diameter calculation.

Content

#!/usr/bin/env python

import scipy as s
import numpy as n
import pylab as p
import sys

def duplicate_and_mix_array(my_2d_arr):

    #this function is useful to cast the time-dependent edge list
    #(time, id_a, id_b) into a shape more similar to the bootcount
    #list: two (time, id) rows per edge

    new_arr=s.zeros(2*2*len(my_2d_arr)).reshape((2*len(my_2d_arr),2))
    new_arr=new_arr.astype("int64")

    sel_even=s.arange(0,len(new_arr),2)
    sel_odd=s.arange(1,len(new_arr),2)

    #both output rows inherit the timestamp of the edge...
    new_arr[sel_even,0]=my_2d_arr[:,0]
    new_arr[sel_odd,0]=my_2d_arr[:,0]

    #...while the two node ids go to alternating rows
    new_arr[sel_even,1]=my_2d_arr[:,1]
    new_arr[sel_odd,1]=my_2d_arr[:,2]

    return (new_arr)
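
# Illustrative sketch (hypothetical input): each row of my_2d_arr is
# (time, id_a, id_b). For
#   edges = s.array([[10, 1, 2], [20, 3, 4]], dtype="int64")
# duplicate_and_mix_array(edges) would return
#   [[10, 1], [10, 2], [20, 3], [20, 4]]
# i.e. one (time, id) row per endpoint of each edge.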


def remove_equal_lines(A):

    #count how many times each distinct row of A occurs
    d = {}
    for r in A:
        t = tuple(r)
        d[t] = d.get(t,0) + 1

    # The dict d now has the counts of the unique rows of A.

    B = s.array(d.keys())    # The unique rows of A
    C = s.array(d.values())  # The counts of the unique rows

    return [B,C]
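
# Illustrative sketch (hypothetical input): for
#   A = s.array([[1, 2], [1, 2], [3, 4]])
# remove_equal_lines(A) returns the unique rows [[1, 2], [3, 4]] with
# counts [2, 1]; dict iteration fixes no particular row order.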




# def select_entrance_single_tag(entrance_data):
#     len_arr=len(entrance_data)

#     entrance_list=s.zeros(len_arr).astype("int")-1 #array initialization
#     ip_set=set((168624164,3232235522))
    
#     for i in xrange(len_arr):
#         my_set=set(entrance_data[i])
#         if len(ip_set & my_set)>0:
#             entrance_list[i]=1

#     return entrance_list
        


def visit_duration_single_tag(my_data_arr,visitor_id, timeslice):

    find_visitor=s.where(my_data_arr[:,1]==visitor_id)[0]

    #the visitor id packs a tag id and a boot count into a single
    #integer, so the two fields can be recovered by bit manipulation
    tag_recover= visitor_id >> 16
    boot_recover = visitor_id & 0xFFFF

    time_visit=my_data_arr[find_visitor,0]

    # if ((time_visit[-1]-time_visit[0])<0):
    #     print "ordering problem"
    #     print "visitor id is, ", visitor_id
    #     print "max and min(time_visit) are, ",max(time_visit),min(time_visit)

    #     p.save("data_arr_sel.dat", my_data_arr, fmt="%d")
    

    # visit_duration=max(time_visit)-min(time_visit)

    #I add the timeslice just to make sure I am using the same
    #definition Alain is using.

    visit_duration=time_visit[-1]-time_visit[0]+timeslice

    # print "visit_duration is, ", visit_duration

    return [visit_duration, visitor_id,time_visit[0],time_visit[-1],\
            tag_recover, boot_recover]
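
# Worked example of the id packing assumed above: tag=3 and boot=7 give
# visitor_id = (3 << 16) | 7 == 196615, and indeed
# 196615 >> 16 == 3 and 196615 & 0xFFFF == 7.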


def visit_duration_many_tags(my_data_arr, timeslice,min_dur, max_dur):
    
    visitor_id_list=s.unique1d(my_data_arr[:,1])

    visit_flux=s.arange(1)  #one-element array that will hold the visit count

    visit_arr=s.arange(len(visitor_id_list))  #one duration per visitor

    visit_arr_extended=s.arange(6*len(visitor_id_list)).\
                        reshape(len(visitor_id_list),6)

    my_black_overall=s.zeros(0)

    for i in xrange(len(visitor_id_list)):
        visitor_id=visitor_id_list[i]
        visit_data=visit_duration_single_tag(my_data_arr,visitor_id, timeslice)
        visit_arr[i]=visit_data[0]
        # print "visit_duration_single_tag(data_arr,visitor_id) is, ", \
        #       visit_duration_single_tag(data_arr,visitor_id)

        visit_arr_extended[i,0]=visit_data[4]
        visit_arr_extended[i,1]=visit_data[5]
        visit_arr_extended[i,2]=visit_data[0]
        visit_arr_extended[i,3]=visit_data[2]
        visit_arr_extended[i,4]=visit_data[3]
        visit_arr_extended[i,5]=visit_data[1]

        my_black=generate_blacklist(visitor_id, visit_data[0],min_dur, max_dur )
        my_black_overall=s.hstack((my_black_overall,my_black))

    visit_flux[:]=len(visit_arr)

    return [visit_arr,visit_flux,visit_arr_extended, my_black_overall]
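
# Sketch of the returned structure, as filled in above:
#   [0] one visit duration per visitor,
#   [1] one-element array holding the total visit count,
#   [2] extended rows (tag, boot, duration, t_first, t_last, visitor_id),
#   [3] ids of the blacklisted visitors.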

#here boot_time is simply a dummy argument of the function: I deal
#only with contact protocols

def generate_iteration_grid(boot_time_non_unique,\
                            number_intervals,interval_duration,t_ini,t_end):
    time_unique=s.unique1d(boot_time_non_unique)

    # time_ini=time_unique[0]+ini_gap

    
    if (t_ini>0):
        time_ini=t_ini
    else:
        time_ini=time_unique[0]

    
    print "time_ini is, ", time_ini

    if (number_intervals<=0 and interval_duration<=0):
        print "Error in calling the function"
        sys.exit(1)  #time_grid would otherwise be undefined below

    if (number_intervals>0 and interval_duration>0):
        print "Error in calling the function"
        sys.exit(1)  #the two options are mutually exclusive

    if (number_intervals>0 and interval_duration<=0):
        if (t_end<=0):
            time_grid=s.linspace(time_ini,time_unique[-1],number_intervals)
        elif (t_end>0):
            time_grid=s.linspace(time_ini,t_end,number_intervals)

    if (number_intervals<=0 and interval_duration>0):
        if (t_end<=0):
            time_grid=s.arange(time_ini,time_unique[-1],interval_duration)
        elif (t_end>0):
            time_grid=s.arange(time_ini,t_end,interval_duration)

    return (time_grid.astype("int64"))
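
# Illustrative sketch (hypothetical values): with t_ini=0, t_end=100,
# number_intervals=-1 and interval_duration=25 the function returns
# s.arange(0, 100, 25) == [0, 25, 50, 75], so the main loop below scans
# the intervals [0,25), [25,50) and [50,75).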


def generate_blacklist(visitor_id, visit_duration,min_dur, max_dur ):
    #return the visitor id if its visit falls outside the accepted
    #duration window, an empty array otherwise
    if (visit_duration<min_dur or visit_duration>max_dur):
        return (visitor_id)
    else:
        return (s.zeros(0).astype("int64"))
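
# Example with the parameter values set below (min_dur=300, max_dur=10800):
# a 90-second visit returns its visitor id (to be blacklisted), while a
# 600-second visit returns an empty array.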
    
        
     

#Parameters to be used in the filtering of the visits

min_dur= 60*5   #5 min
max_dur= 60*180 #180 min
min_visits_in_period=30

timeslice=20 #added to each raw visit duration (see visit_duration_single_tag)



f = open(sys.argv[1])
data_arr = [map(int, line.split()) for line in f]
f.close()

data_arr = s.array(data_arr, dtype="int64")


#I should read the disambiguated network representation as an edgelist
#into data_arr
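
#Assumed input format (one contact per line, whitespace-separated
#integers): <timestamp> <visitor_id_a> <visitor_id_b>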

#now a test

#p.save("data_arr_rewritten.dat", data_arr, fmt="%d")

print "I finished reading the file"

boot_time_non_unique=data_arr[:,0]

t_ini=1246233600   
t_end=t_ini+86400*3+1
number_intervals=-12 #negative, so interval_duration is used instead
interval_duration= 86400 #one day in seconds (604800 would be one week)

time_grid=generate_iteration_grid(boot_time_non_unique,\
                            (number_intervals+1),interval_duration,t_ini,t_end)

print "len(time_grid) is, ", len(time_grid)
print "time_grid[0] and time_grid[-1] are, ",  time_grid[0], time_grid[-1]

n.savetxt("time_grid.dat", time_grid, fmt="%d")


file_list=-s.ones(len(time_grid))

blacklist_file_list=-s.ones(len(time_grid))


raw_mean=-s.ones(len(time_grid))
refined_mean=-s.ones(len(time_grid))

number_visit_raw=-s.ones(len(time_grid))

number_visit_refined=-s.ones(len(time_grid))


#print "file_list is, ", file_list

for i in xrange(len(time_grid)-1):

    print "i+1 is, ", i+1

    time_sel=s.where((boot_time_non_unique>= time_grid[i]) & \
                     (boot_time_non_unique< time_grid[i+1]))[0]
    
    
    data_arr_sel=data_arr[time_sel,:]

    # if (i==20):
    #     p.save("data_arr_sel_ini.dat",data_arr_sel,fmt="%d")
    #     p.save("time_sel.dat",time_sel+1,fmt="%d")


    #Now I need to change the shape of data_arr_sel

    data_arr_sel=duplicate_and_mix_array(data_arr_sel)

    # if (i==20):
    #     p.save("data_arr_sel_ini_reshaped.dat",data_arr_sel,fmt="%d")


    #The call below was meant to get rid of repeated entries
    #corresponding to self interactions, but it has been commented out
    #since it was a mistake: removing those lines was creating a mess,
    #and I am not even sure that remove_equal_lines works correctly here.

    # data_arr_sel=remove_equal_lines(data_arr_sel)[0]

    # if (i==20):
    #     p.save("data_arr_sel_remove_lines.dat", data_arr_sel, fmt="%d")

    #flag telling me whether I collected any data in this time period
    museum_open=1



    if (len(data_arr_sel)>0): #do not do anything if nobody comes to the museum!

        # print "data_arr_sel is, ", data_arr_sel
        
        all_duration_and_all_counts=visit_duration_many_tags(data_arr_sel,timeslice,min_dur, max_dur)


        filename="all_visit_duration_%01d"%(i+1)
        filename=filename+"_.dat"

        all_durations=all_duration_and_all_counts[0]

        all_durations_selected=all_durations[s.where((all_durations>=min_dur) \
                                      & (all_durations<=max_dur) )]

        number_visit_refined[i]=len(all_durations_selected)
        
        raw_mean[i]=s.mean(all_durations)
        #careful! On certain days nothing may survive the filtering!
        if (len(all_durations_selected)>0):
            refined_mean[i]=s.mean(all_durations_selected)

        #print "len(all_durations) is, ", len(all_durations)


        n.savetxt(filename,all_durations , fmt='%d')

        #Now save extended visit info like Alain does
        
        filename="all_visit_duration_extended_%01d"%(i+1)
        filename=filename+"_.dat"

        all_durations_extended=all_duration_and_all_counts[2]

        my_ord=s.argsort(all_durations_extended[:,0])

        all_durations_extended=all_durations_extended[my_ord,:]
        n.savetxt(filename,all_durations_extended , fmt='%d')


        # #Now do some polishing of the extended visit information

        # my_ord=s.where(all_durations_extended[:,2]>0)[0]
        # all_durations_extended=all_durations_extended[my_ord]

        filename="all_counts_%01d"%(i+1)
        filename=filename+"_.dat"

        all_counts=all_duration_and_all_counts[1]

        number_visit_raw[i]=all_counts

        print "all_counts is, ", all_counts

        n.savetxt(filename,all_counts , fmt='%d')



        filename="blacklist_%01d"%(i+1)
        filename=filename+"_.dat"

        blacklisted_tags=all_duration_and_all_counts[3]

        if (len(blacklisted_tags)>0):
        
            n.savetxt(filename,blacklisted_tags , fmt='%d')

            blacklist_file_list[i]=i+1

        file_list[i]=i+1

    else:
        museum_open=-2

        print "i+1 for an empty period is, ", i+1
        print "time_grid[i] and time_grid[i+1] are, ", time_grid[i], \
              time_grid[i+1]
    






file_list_ini=s.copy(file_list)

sel=s.where(file_list>0)

file_list=file_list[sel]

#print "file_list is, ", file_list

n.savetxt("file_list.dat", file_list, fmt='%d')

#raw_mean must be filtered with the same selection as file_list (the
#non-empty periods), so do it before sel is overwritten by the
#blacklist selection

ini_raw_mean=s.copy(raw_mean)

raw_mean=raw_mean[sel]
n.savetxt("raw_mean.dat", raw_mean, fmt='%d')
n.savetxt("raw_mean_in_minutes.dat", raw_mean/60.)

sel=s.where(blacklist_file_list>0)

blacklist_file_list=blacklist_file_list[sel]

n.savetxt("blacklist_file_list.dat", blacklist_file_list, fmt='%d')




#Now this is tricky: there can be certain days when I have no visits after
#filtering!

sel=s.where(refined_mean>0)

ini_refined_mean=s.copy(refined_mean)

refined_mean=refined_mean[sel]
n.savetxt("refined_mean.dat", refined_mean , fmt='%d')

n.savetxt("refined_mean_in_minutes.dat", refined_mean/60. )


n.savetxt("refined_file_list.dat", file_list_ini[sel], fmt='%d')




# fig = p.figure()
# axes = fig.gca()


# axes.plot(file_list_ini[sel],refined_mean/60.,"ro",\
#           label="filter on min/max",linewidth=2.)

# axes.plot(file_list_ini[sel], ini_raw_mean[sel]/60.,"bx",linewidth=2.,\
#           label="raw data (same days)" )


# axes.legend()
# p.ylim((5,90))
# p.xlabel('Time (days)')
# p.ylabel('Mean Visit Duration')
# p.savefig("comparison_visit_duration_same_days.pdf")

# p.clf()    






# fig = p.figure()
# axes = fig.gca()


# axes.plot(file_list_ini[sel],\
#    abs(refined_mean/60.-ini_raw_mean[sel]/60.)/(ini_raw_mean[sel]/60.)*100,"ro"\
#     ,linewidth=2.)

# p.xlabel('Time (days)')
# p.ylabel('Relative % difference filtered vs raw')
# p.savefig("relative_difference_same_days.pdf")

# p.clf()    





# fig = p.figure()
# axes = fig.gca()


# axes.plot(file_list_ini[sel],refined_mean/60.,"ro",\
#           label="filter on min/max",linewidth=2.)

# axes.plot(file_list_ini[sel],refined_mean/60.,"r" \
#           ,linewidth=2.)


# # axes.plot(file_list, raw_mean/60.,"b",linewidth=2.,\
# #           label="raw data" )


# axes.legend()
# p.xlabel('Time (days)')
# p.ylabel('Mean Visit Duration')
# p.savefig("filtered_visit_duration.pdf")
 
# p.clf()    



#Now choose to take statistics only on days when there is a minimum
#number of recorded visits

sel=s.where(number_visit_refined>=min_visits_in_period)


# fig = p.figure()
# axes = fig.gca()


# axes.plot(file_list_ini[sel],ini_refined_mean[sel]/60.,"ro",\
#           label="filter on min/max",linewidth=2.)

# axes.plot(file_list_ini[sel], ini_raw_mean[sel]/60.,"bx",linewidth=2.,\
#           label="raw data (same days)" )

# p.title("Select on minimum number of visits per day")
# axes.legend()
# p.ylim((5,90))
# p.xlabel('Time (days)')
# p.ylabel('Mean Visit Duration')
# p.savefig("comparison_visit_duration_same_days_and_minimum_number_visits.pdf")

# p.clf()    


print "So far so good"