diff --git a/crosscorrelation.Rmd b/crosscorrelation.Rmd
new file mode 100644
index 00000000..c5d296a7
--- /dev/null
+++ b/crosscorrelation.Rmd
@@ -0,0 +1,384 @@
---
title: "uploadid"
author: "Amenze"
date: "June 28, 2018"
output: html_document
---

```{r}
# Later chunks read their input files relative to this directory.
setwd("/Users/Amenze/Desktop/tidepool/refdata")
```

```{r}
# library(data.table)
library(ggplot2)
library(plyr)
library(dplyr)
# using jmotif
library(jmotif)
# install.packages("RecordLinkage")
library(RecordLinkage)
# install.packages("PTXQC")
library("PTXQC")
library(stringr)
library(zoo)
```

```{r}
# Folder that holds the per-patient CSV exports.
data.dir <- "/Users/Amenze/Desktop/tidepool/refdata"

# create a list from these files
# "\\.csv$" escapes the dot, so only a literal ".csv" extension matches.
list.filenames <- list.files(data.dir, pattern = "\\.csv$")
# Skip the outputs of an earlier run; otherwise every re-run would produce
# "duplicatedduplicated<name>" copies of its own results.
list.filenames <- list.filenames[!startsWith(list.filenames, "duplicated")]

# extract files based on duplicated utctime
# Iterating over the names themselves is safe when the folder has no CSV files
# (1:length(x) would yield c(1, 0) and fail on the first read).
for (file.name in list.filenames) {
  # Build the full path so the read does not depend on the working directory.
  patient <- read.csv(file.path(data.dir, file.name))
  patient_cbg <- subset(patient, patient$type == "cbg")
  # Only files whose cbg rows repeat a utcTime are written back out.
  if (anyDuplicated(patient_cbg$utcTime) > 0) {
    write.csv(patient_cbg,
              file.path(data.dir, paste0("duplicated", file.name)))
  }
}
```

```{r}
## read in file and check for duplicated utc
field <- c("deviceId", "id", "uploadId", "utcTime", "type", "value")
patient <- read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[, field]
# NOTE(review): 18.01559 looks like the glucose mmol/L -> mg/dL factor - confirm.
patient$mgvalue <- patient$value * 18.01559
length(unique(patient$uploadId))                   # 1319
length(unique(patient$id)) == nrow(patient)        ## True
length(unique(patient$utcTime))                    # 72287
length(unique(patient$utcTime)) == nrow(patient)   # False

### subset patient file based on 5 uploadids
df1 <- subset(patient, patient$uploadId == "upid_3c41703c2d3a8b97f479afdb6ccf799f")
df2 <- subset(patient, patient$uploadId == "upid_3fc32e5ad912a8ea7efced9151804bdb")
df3 <- subset(patient, patient$uploadId == "upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
df4 <- subset(patient, patient$uploadId == "upid_5fad608cf32bd03a1cd56e3bb1fdb834")
df5 <- subset(patient, uploadId == "upid_830c6de3e2ecbbec6fbad0cecc64bdf5")

# plot values for each uploadid
par(mfrow = c(3, 3))
plot.ts(df1$value)
plot.ts(df2$value)
plot.ts(df3$value)
plot.ts(df4$value)
plot.ts(df5$value)
```

```{r}
## stringcnvert: discretise two numeric series and compare the resulting strings
## input
# x: vector value for uploadid x
# y: vector value for uploadid y
# alphaxy: the alphabet size
## outputs (unnamed list, in this order)
# 1. longest common substring of the two discretised strings (LCSn)
# 2. discretised string for x
# 3. discretised string for y
# 4. similarity of the two strings (levenshteinSim)
stringcnvert <- function(x, y, alphaxy) {
  # Both series are scaled with the mean and standard deviation of x,
  # whether or not the two lengths agree.
  centre <- mean(x)
  spread <- sd(x)
  # paa() is asked for as many segments as there are points, so the exact
  # length of each vector is kept.
  sax.x <- series_to_string(paa((x - centre) / spread, length(x)), alphaxy)
  sax.y <- series_to_string(paa((y - centre) / spread, length(y)), alphaxy)
  list(LCSn(c(sax.x, sax.y)), sax.x, sax.y, levenshteinSim(sax.x, sax.y))
}

## duplicateindex
## input
# values: output from the stringcnvert function
## outputs
# stringvalue: the 2 strings compared
# substringrep: matching substring that was compared
# stringdetect: boolean equivalent of string detected (TRUE: detected, FALSE: not detected)
# stringcount: count of substring in stringvalue
# loc: starting and ending indexes of the substring in the two strings compared
# Locate the common substring inside both discretised strings.
#
# values: list returned by stringcnvert(); element 1 is the common substring,
#         elements 2 and 3 are the two strings.
# Returns an unnamed list: the two strings, the substring, str_detect() flags,
# str_count() counts and the str_locate() start/end matrix (one row per string).
duplicateindex <- function(values) {
  stringsvalues <- as.character(c(values[2], values[3]))
  substringrep <- as.character(values[1])
  stringdetect <- str_detect(stringsvalues, substringrep)
  stringcount <- str_count(stringsvalues, substringrep)
  loc <- str_locate(stringsvalues, substringrep)
  list(stringsvalues, substringrep, stringdetect, stringcount, loc)
}

# Pull the matched row ranges out of two data frames.
#
# x, y: data frames with a "value" column
# strngindex: output of duplicateindex(); element 5 is the 2 x 2 start/end matrix
# Returns an unnamed list: values of x, values of y, row names of x, row names
# of y, each restricted to the matched range.
extractindex <- function(x, y, strngindex) {
  loc <- strngindex[[5]]
  # str_locate() yields NA when the substring is absent; fail with a readable
  # message instead of the "NA/NaN argument" raised by NA:NA.
  if (anyNA(loc)) {
    stop("no common substring was located in one of the strings", call. = FALSE)
  }
  # Column-major layout: [[1]]/[[2]] are the starts, [[3]]/[[4]] the ends.
  rows.x <- loc[[1]]:loc[[3]]
  rows.y <- loc[[2]]:loc[[4]]
  s1 <- x[rows.x, "value"]
  s2 <- y[rows.y, "value"]
  indexs1 <- row.names(x[rows.x, ])
  indexs2 <- row.names(y[rows.y, ])
  list(s1, s2, indexs1, indexs2)
}

# Convert a list of index vectors to one numeric vector.
#
# x: a list of indexes (e.g. validate[3])
# Returns the numeric equivalent of every index in x; numeric(0) for an empty
# list. The previous loop overwrote its result on each pass, so only the last
# element survived, and it failed outright on an empty list.
extractvalue <- function(x) {
  as.numeric(unlist(x))
}

## main
# http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
system.time(values <- stringcnvert(df1$mgvalue, df2$mgvalue, 3))
# recorded timing of the stringcnvert() call:
#    user  system elapsed
#    0.11    0.00    0.11

system.time(strngindex <- duplicateindex(values))
validate <- extractindex(df1, df2, strngindex)
seq1 <- extractvalue(validate[3])
seq2 <- extractvalue(validate[4])

# look the matched rows up again in the full patient table
df1_mgvalue <- patient[seq1, "mgvalue"]
df2_mgvalue <- patient[seq2, "mgvalue"]
compare <- data.frame(df1_mgvalue, df2_mgvalue)

# par(mfrow=c(1,2))
# plot.ts(compare$df1_mgvalue)
# plot.ts(compare$df2_mgvalue)

plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue),
         plot.type = "multiple",
         col = c("red", "blue"))

# sum of the pairwise differences of the two matched segments
with(compare, sum(df1_mgvalue - df2_mgvalue))
```

```{r}
# http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
## 3 worked better for this run
values <- stringcnvert(df3$mgvalue, df4$mgvalue, 3)
strngindex <- duplicateindex(values)
validate <- extractindex(df3, df4, strngindex)
seq1 <- extractvalue(validate[3])
seq2 <- extractvalue(validate[4])

df1_mgvalue <- patient[seq1, "mgvalue"]
df2_mgvalue <- patient[seq2, "mgvalue"]
compare <- data.frame(df1_mgvalue, df2_mgvalue)

# par(mfrow=c(1,2))
# plot.ts(compare$df1_mgvalue)
# plot.ts(compare$df2_mgvalue)

plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue),
         plot.type = "multiple",
         col = c("red", "blue"))

with(compare, sum(df1_mgvalue - df2_mgvalue))
```

```{r}
# second reference file: same columns, two uploads
patient <- read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[, field]
patient$mgvalue <- patient$value * 18.01559
length(unique(patient$uploadId))                   # 2
length(unique(patient$id)) == nrow(patient)        ## True
length(unique(patient$utcTime))                    # 8899
anyDuplicated(patient$utcTime)                     # 4366
length(unique(patient$utcTime)) == nrow(patient)   # False

df1 <- subset(patient, uploadId == "2f61322480c841fd8679fe81e94930b2")
df2 <- subset(patient, uploadId == "c05970591b404518a1cbd64595d628e5")
```

```{r}
values <- stringcnvert(df1$mgvalue, df2$mgvalue, 8)
strngindex <- duplicateindex(values)
validate <- extractindex(df1, df2, strngindex)
seq1 <- extractvalue(validate[3])
seq2 <- extractvalue(validate[4])

df1_mgvalue <- patient[seq1, "mgvalue"]
df2_mgvalue <- patient[seq2, "mgvalue"]
compare <- data.frame(df1_mgvalue, df2_mgvalue)

plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue),
         plot.type = "multiple",
         col = c("red", "blue"))

sum(compare$df1_mgvalue - compare$df2_mgvalue)
```

#cross correlation
```{r}
# Find the lag with the highest cross-correlation between the mgvalue columns
# of two data frames and return the rows selected for that lag.
#
# x, y: data frames with an "mgvalue" column
# lagsize: largest lag (in samples) examined by ccf()
# Returns list(df1 = rows of x, df2 = rows of y, max_correlation = peak value).
corre <- function(x, y, lagsize) {
  # ccf() names this argument lag.max; the former `max.lag =` was swallowed by
  # `...`, so lagsize never limited the lags that were examined.
  corr <- ccf(x$mgvalue, y$mgvalue, lag.max = lagsize, plot = TRUE)
  max.value <- max(corr$acf)
  # which.max() always yields one position, so the `if` below receives a scalar
  # even when several lags share the peak.
  max.lag <- corr$lag[which.max(corr$acf)]
  xlength <- length(x$mgvalue)
  ylength <- length(y$mgvalue)

  if (max.lag < 0) {
    # Negative lag: row selection kept as originally written (reversed order).
    startcomputelength <- abs(max.lag)
    xvalue <- x[(xlength - startcomputelength):1, ]
    yvalue <- y[ylength:(startcomputelength + 1), ]
  } else {
    # NOTE(review): the original guard of this branch was corrupted in the
    # source (it referenced the undefined names `xlength0` / `ylength0`). It is
    # reconstructed as "lag >= 0" so that a result is always defined - confirm
    # against the author's intent.
    # nrow() replaces length(), which counts the columns of a data frame.
    xvalue <- x[max.lag:nrow(x), ]
    yvalue <- y[max.lag:nrow(y), ]
  }

  list(df1 = xvalue, df2 = yvalue, max_correlation = max.value)
}

system.time(corr <- corre(df1, df2, 10))
df1adj <- corr$df1
df2adj <- corr$df2

# ccf(df1$mgvalue,df2$mgvalue)

## extract the index
t1indexvalue <- row.names(df1adj)
t2indexvalue <- row.names(df2adj)
ts1 <- patient[t1indexvalue, "mgvalue"]
ts2 <- patient[t2indexvalue, "mgvalue"]

plot.zoo(cbind(ts1, ts2),
         plot.type = "multiple",
         col = c("red", "blue"))
sum(ts1 - ts2)
```

```{r}
corr <- corre(df3, df4, 10)
df1adj <- corr$df1
df2adj <- corr$df2
```

diff --git a/deduplication.Rmd b/deduplication.Rmd
new file mode 100644
index 00000000..159c9729
--- /dev/null
+++ b/deduplication.Rmd
@@ -0,0 +1,196 @@
---
title: "De-duplication"
author: "Amenze Okpah"
date: "June 28, 2018"
output: html_document
---

Compare similarity between two numeric vectors using Symbolic Aggregate approXimation (SAX).
Applying the Symbolic Aggregate approXimation (SAX) algorithm to an input time series transforms it into a string.
The algorithm was proposed by Lin et al. and extends the PAA-based approach, inheriting the original algorithm's simplicity and low computational complexity while providing satisfactory sensitivity and selectivity in range-query processing. Moreover, the use of a symbolic representation opened a door to the existing wealth of data structures and string-manipulation algorithms in computer science, such as hashing, regular expressions, pattern matching, suffix trees, and grammatical inference.
#http://www.cs.ucr.edu/~eamonn/SAX.htm
#https://jmotif.github.io/sax-vsm_site/morea/algorithm/PAA.html

```{r}
# create a list from these files
# NOTE(review): choose.dir() is only available in R for Windows.
dir <- choose.dir(default = "", caption = "Select folder")
# choose.dir() returns NA when the dialog is cancelled.
if (is.na(dir)) {
  stop("No folder selected", call. = FALSE)
}
# "\\.csv$" escapes the dot, so only a literal ".csv" extension matches.
list.filenames <- list.files(dir, pattern = "\\.csv$")
# extract files based on duplicated utctime
# Iterating over the names is safe when the folder holds no CSV files.
for (file.name in list.filenames) {
  # list.files() returns bare names, so read from the chosen folder rather
  # than from whatever the working directory happens to be.
  patient <- read.csv(file.path(dir, file.name))
  patient_cbg <- subset(patient, patient$type == "cbg")
  if (anyDuplicated(patient_cbg$utcTime) > 0) {
    # NOTE(review): paste0() joins folder and file name without a separator,
    # so the output lands next to the folder, not inside it. Left unchanged
    # because file.path(dir, file.name) would overwrite the input file -
    # confirm the intended destination.
    write.csv(patient_cbg, paste0(dir, file.name))
  }
}
```
 # Converts two vectors to their string equivalent
 # Args:
 # x: vector of numeric values for uploadid1
 # y: vector of numeric values for uploadid2
 # alpha.Size: alphabet size (number of characters used to discretize the numeric values)
 # Returns:
 # longeststring: longest common substring match between the discretized values of x and y (stringX, stringY)
 # percentage similarity between vector x and y

#Note: since PAA takes normalized values, both vectors are normalized using the mean and standard deviation of x.
#PAA: the length of the PAA output is fixed to the length of the vectors to avoid reducing the dimensions, since all values are needed to check for duplication.
```{r}
# Discretise x and y and compare the two resulting strings.
# Returns an unnamed list: longest common substring, string for x, string for
# y, and the levenshteinSim() similarity of the two strings.
StringConvert <- function(x, y, alpha.Size) {
  # Both vectors are scaled with the mean and standard deviation of x,
  # whether or not their lengths agree.
  centre <- mean(x)
  spread <- sd(x)
  # paa() is asked for as many segments as there are points.
  stringX <- series_to_string(paa((x - centre) / spread, length(x)), alpha.Size)
  stringY <- series_to_string(paa((y - centre) / spread, length(y)), alpha.Size)
  list(LCSn(c(stringX, stringY)), stringX, stringY,
       levenshteinSim(stringX, stringY))
}
```
#computes the range of matching characters between the string values and the common subsequence
 Args:
 values: output from the StringConvert function
 Returns:
 loc: matching ranges
```{r}
# Wraps the str_locate() start/end matrix (one row per string) in a list.
IndexRange <- function(values) {
  both.strings <- as.character(c(values[2], values[3]))
  shared.part <- as.character(values[1])
  list(str_locate(both.strings, shared.part))
}
```
##computes the duplicated values for each vector and their indexes
 # Args:
 # x & y: data frames with a "value" column
 # IndexRange: output of the IndexRange function
 # Returns:
 # duplicate.X: duplicated values for x
 # duplicate.Y: duplicated values for y
 # index.X: row names of the duplicated values in x
 # index.Y: row names of the duplicated values in y
```{r}
ExtractDuplicateIndex <- function(x, y, IndexRange) {
  loc <- IndexRange[[1]]
  # Row 1 of the matrix belongs to x, row 2 to y; columns are start and end.
  rows.x <- loc[1, 1]:loc[1, 2]
  rows.y <- loc[2, 1]:loc[2, 2]
  list(x[rows.x, "value"],
       y[rows.y, "value"],
       row.names(x[rows.x, ]),
       row.names(y[rows.y, ]))
}
```
##computes a list of indexes
 Args:
 x: a list of index vectors
 Returns:
 duplicate.Index: duplicated value index
```{r}
# Convert a list of index vectors to one numeric vector; numeric(0) for an
# empty list. The previous loop overwrote its result on each pass, so only the
# last element survived, and it failed outright on an empty list.
ExtractVectorIndex <- function(x) {
  as.numeric(unlist(x))
}
```
#import files and subset data based on uploadIds to test function

```{r}
## read in file and check for duplicated utcTime
field <- c("deviceId", "id", "uploadId", "utcTime", "type", "value")
# 0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
patient <- read.csv("FileName ")[, field]
patient$mgvalue <- patient$value * 18.01559

# check for unique uploadIds
# NOTE(review): this used to return unique(df$utcTime) from inside a loop over
# the uploadId column; the name, the comment and the loop target all point at
# uploadId, so that column is returned now - confirm.
uniqueid <- function(df) {
  unique(df$uploadId)
}

### subset patient file based on 5 uploadids
df1 <- subset(patient, patient$uploadId == "upid_3c41703c2d3a8b97f479afdb6ccf799f")
df2 <- subset(patient, patient$uploadId == "upid_3fc32e5ad912a8ea7efced9151804bdb")
df3 <- subset(patient, patient$uploadId == "upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
df4 <- subset(patient, patient$uploadId == "upid_5fad608cf32bd03a1cd56e3bb1fdb834")
df5 <- subset(patient, patient$uploadId == "upid_830c6de3e2ecbbec6fbad0cecc64bdf5")

# plot values for each uploadid
par(mfrow = c(3, 3))
plot.ts(df1$value)
plot.ts(df2$value)
plot.ts(df3$value)
plot.ts(df4$value)
plot.ts(df5$value)
```

```{r}
# http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
system.time(values <- StringConvert(df1$mgvalue, df2$mgvalue, 3))
system.time(strngindex <- IndexRange(values))
validate <- ExtractDuplicateIndex(df1, df2, strngindex)
seq1 <- ExtractVectorIndex(validate[3])
seq2 <- ExtractVectorIndex(validate[4])
# look the matched rows up again in the full patient table ("value" column)
df1_mgvalue <- patient[seq1, "value"]
df2_mgvalue <- patient[seq2, "value"]
compare <- data.frame(df1_mgvalue, df2_mgvalue)

# plot duplicated values
plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue),
         plot.type = "multiple",
         col = c("red", "blue"))
# sum of the pairwise differences of the two matched segments
with(compare, sum(df1_mgvalue - df2_mgvalue))
```

```{r}
# http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
## 3 worked better for this run
values <- StringConvert(df3$mgvalue, df4$mgvalue, 3)
strngindex <- IndexRange(values)
validate <- ExtractDuplicateIndex(df3, df4, strngindex)
seq1 <- ExtractVectorIndex(validate[3])
seq2 <- ExtractVectorIndex(validate[4])

df1_mgvalue <- patient[seq1, "value"]
df2_mgvalue <- patient[seq2, "value"]
compare <- data.frame(df1_mgvalue, df2_mgvalue)
plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue),
         plot.type = "multiple",
         col = c("red", "blue"))
with(compare, sum(df1_mgvalue - df2_mgvalue))
```

```{r}
# 0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
patient <- read.csv("filename")[, field]
patient$mgvalue <- patient$value * 18.01559
length(unique(patient$uploadId))
length(unique(patient$id)) == nrow(patient)
length(unique(patient$utcTime))
anyDuplicated(patient$utcTime)
length(unique(patient$utcTime)) == nrow(patient)
df1 <- subset(patient, uploadId == "2f61322480c841fd8679fe81e94930b2")
df2 <- subset(patient, uploadId == "c05970591b404518a1cbd64595d628e5")
```

```{r}
values <- StringConvert(df1$mgvalue, df2$mgvalue, 3)
strngindex <- IndexRange(values)
validate <- ExtractDuplicateIndex(df1, df2, strngindex)
seq1 <- ExtractVectorIndex(validate[3])
seq2 <- ExtractVectorIndex(validate[4])

df1_mgvalue <- patient[seq1, "value"]
df2_mgvalue <- patient[seq2, "value"]
compare <- data.frame(df1_mgvalue, df2_mgvalue)
plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue),
         plot.type = "multiple",
         col = c("red", "blue"))
sum(compare$df1_mgvalue - compare$df2_mgvalue)

```
diff --git a/deduplicationFunction.R b/deduplicationFunction.R
new file mode 100644
index 00000000..1f5a0e48
--- /dev/null
+++ b/deduplicationFunction.R
@@ -0,0 +1,72 @@
StringConvert <- function(x, y, alpha.Size) {
  # Converts two vectors to their string equivalent.
  # Args:
  #   x: vector value for uploadid x
  #   y: vector value for uploadid y
  #   alpha.Size: alphabet size
  # Returns (unnamed list, in this order):
  #   longest common substring of the two discretized strings,
  #   the string for x, the string for y,
  #   levenshteinSim() similarity of the two strings
  #
  # Both vectors are scaled with the mean and standard deviation of x; the
  # former length test assigned x in either branch.
  normvalue.Mean <- mean(x)
  normvalue.Dev <- sd(x)
  xnormalized <- (x - normvalue.Mean) / normvalue.Dev
  ynormalized <- (y - normvalue.Mean) / normvalue.Dev
  # paa() is asked for as many segments as there are points.
  stringX <- series_to_string(paa(xnormalized, length(x)), alpha.Size)
  stringY <- series_to_string(paa(ynormalized, length(y)), alpha.Size)
  longeststring <- LCSn(c(stringX, stringY))
  list(longeststring, stringX, stringY, levenshteinSim(stringX, stringY))
}


IndexRange <- function(values) {
  # Computes the range of matching characters between the string values and
  # the common subsequence.
  # Args:
  #   values: output from the StringConvert function
  # Returns:
  #   one-element list holding the str_locate() start/end matrix
  #   (one row per string)
  strings.Values <- as.character(c(values[2], values[3]))
  common.SubSequence <- as.character(values[1])
  list(str_locate(strings.Values, common.SubSequence))
}


ExtractDuplicateIndex <- function(x, y, IndexRange) {
  # Computes the duplicated values for each input and their indexes.
  # Args:
  #   x & y: data frames with a "value" column
  #   IndexRange: output of the IndexRange function
  # Returns (unnamed list, in this order):
  #   duplicate.X: duplicated values for x
  #   duplicate.Y: duplicated values for y
  #   index.X: row names of the duplicated values in x
  #   index.Y: row names of the duplicated values in y
  loc <- IndexRange[[1]]
  # str_locate() yields NA when the substring is absent; fail with a readable
  # message instead of the "NA/NaN argument" raised by NA:NA.
  if (anyNA(loc)) {
    stop("no common substring was located in one of the strings", call. = FALSE)
  }
  rows.X <- loc[1, 1]:loc[1, 2]
  rows.Y <- loc[2, 1]:loc[2, 2]
  duplicate.X <- x[rows.X, "value"]
  duplicate.Y <- y[rows.Y, "value"]
  index.X <- row.names(x[rows.X, ])
  index.Y <- row.names(y[rows.Y, ])
  list(duplicate.X, duplicate.Y, index.X, index.Y)
}


ExtractVectorIndex <- function(x) {
  # Converts a list of index vectors to one numeric vector.
  # Args:
  #   x: a list of index vectors
  # Returns:
  #   duplicate.Index: numeric indexes; numeric(0) for an empty list
  #
  # The previous loop overwrote its result on each pass, so only the last
  # element survived, and it failed outright on an empty list.
  as.numeric(unlist(x))
}
\ No newline at end of file
diff --git a/deduplication_distance.py b/deduplication_distance.py
new file mode 100644
index 00000000..40ddffd3
--- /dev/null
+++ b/deduplication_distance.py
@@ -0,0 +1,337 @@
import numpy as np
import pandas as pd
"""
Data Deduplication of Continuous blood glucose.
"""
"""
Final output are indexes of duplicated values, duplicated values and a plot of the values
"""
"""
Test data from two files
files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
      0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
"""
#Test data1
##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',')
##print data
##df1=data.loc[data['alp'] =='x', ['value']]
##df2=data.loc[data['alp'] =='y', ['value']]
#Test data2
#data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',')
##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']]
##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']]
#Test data3
##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']]
##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
#Test data4
##df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
##df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']]


def Distances(x, y):
    """
    Compute the squared-difference matrix for vector x and y
    Args:
        x:vector value for uploadid x
        y:vector value for uploadid y
    Returns:
        distances: list of rows; one row per element of the longer vector,
        one column per element of the shorter vector
    """
    # The shorter vector always supplies the columns.
    if len(x) > len(y):
        xval, yval = y, x
    else:
        xval, yval = x, y
    return [[(xval[j] - yval[i]) ** 2 for j in range(len(xval))]
            for i in range(len(yval))]


def DiagonalList(dis):
    """
    Find the diagonal with the highest count of zero and the diagonal start index
    Args:
        dis: distance matrix
    Returns:
        diagonal: diagonal with highest count of zero
        diagonalStartIndex: start of that diagonal. A value >= 0 is the row at
        which the diagonal leaves column 0 (on or below the main diagonal); a
        value < 0 means the diagonal starts in row 0 at column
        -diagonalStartIndex (above the main diagonal).
    """
    matrix = np.array(dis)
    highestCount = 0
    # Walk from the top-right diagonal down to the bottom-left one; on ties the
    # diagonal visited last wins.
    for offset in range(len(dis[0]) - 1, -len(dis), -1):
        arr = matrix.diagonal(offset)
        countZero = (arr == 0).sum()
        if countZero >= highestCount:
            highestCount = countZero
            diagonal = arr
            # abs(offset) used to discard the side of the main diagonal, so a
            # match above it was later read from the mirrored diagonal.
            diagonalStartIndex = -offset
    return (diagonal, diagonalStartIndex)


def DiagonalZero(disMatrix, ts1, ts2, startindex):
    """
    Collect the cells of the diagonal selected by DiagonalList
    Args:
        disMatrix: distance matrix
        ts1:Vector 1
        ts2:vector 2
        startindex: the start index for the diagonal (see DiagonalList)
    Returns:
        diaIndex: list of [row, column] pairs along the diagonal
        diaValue: matrix values at those cells
    """
    rowCount = max(len(ts1), len(ts2))
    colCount = min(len(ts1), len(ts2))
    if startindex >= 0:
        row, col = startindex, 0
    else:
        # Negative start index: the diagonal lies above the main diagonal.
        row, col = 0, -startindex
    diaIndex = []
    diaValue = []
    while row < rowCount and col < colCount:
        cell = disMatrix[row][col]
        # Squared distances are never negative, so this only rejects NaN.
        # NOTE(review): row and column now always advance together; the
        # indentation of the original `k = k+1` could not be recovered.
        if cell >= 0:
            diaIndex.append([row, col])
            diaValue.append(cell)
        row += 1
        col += 1
    return (diaIndex, diaValue)


def zero_runs(diaValue):  # https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array
    """
    Args:
        diaValue: diagonal values returned from function DiagonalZero
    Returns:
        ranges: array of [start, stop) index pairs of the consecutive zero
        runs in the diagonal
    """
    # Pad with a non-zero marker on both sides so runs touching either end
    # still produce a start and a stop edge.
    iszero = np.concatenate(([0], np.equal(diaValue, 0).view(np.int8), [0]))
    absdiff = np.abs(np.diff(iszero))
    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
    return ranges


def CountZero(runs):
    """
    Args:
        runs: list of start and stop index of the consecutive zeros in an array
    Returns:
        totalCount: length of the longest run of consecutive zeros
        countIndex: position of that run in runs (the last one on ties);
        None when runs is empty
    """
    maxcount = 0
    # Stays None when there is no run at all; this used to surface as an
    # UnboundLocalError.
    countIndex = None
    for i in range(len(runs)):
        count = runs[i][1] - runs[i][0]
        if count >= maxcount:
            maxcount = count
            countIndex = i
    return (maxcount, countIndex)


def ZeroIndex(runs, dia, runindex):
    """
    Accumulate the diagonal cells covered by one run
    Args:
        runs: [start, stop) pair of one run
        dia: diagonal indexes returned from function DiagonalZero
        runindex: not used; kept for existing callers
    Returns:
        indexlst: list of [row, column] pairs inside the run
    """
    start, stop = runs[0], runs[1]
    return [dia[pos] for pos in range(start, stop)]


def DupIndex(x, y, indexzero):
    """
    Args:
        x & y : vectors
        indexzero: output from function ZeroIndex
    Returns:
        xindex & yindex: matrix column / row indexes (shorter / longer vector)
        ts1dup & ts2dup: duplicated values, in reversed order
    """
    # Same orientation as Distances: the shorter vector supplies the columns.
    if len(x) > len(y):
        xval, yval = y, x
    else:
        xval, yval = x, y
    yindex = [cell[0] for cell in indexzero]
    xindex = [cell[1] for cell in indexzero]
    ts1dup = [xval[pos] for pos in xindex][::-1]
    ts2dup = [yval[pos] for pos in yindex][::-1]
    return (xindex, yindex, ts1dup, ts2dup)


def lookupdict(x):
    """
    Args:
        x: vector
    Returns:
        indexDict: a dictionary mapping position -> element of x
    """
    return {pos: x[pos] for pos in range(len(x))}


def ExtracteIndex(xDict, yDict, xIndex, yIndex):
    """
    Args:
        xDict & yDict: dictionary from function lookupdict
        xIndex & yIndex: line up indexes
    Returns:
        xIndexList,yIndexList: original duplicate indexes, in reversed order
    """
    # The smaller dictionary belongs to the shorter vector, which is the one
    # xIndex refers to.
    if len(xDict) > len(yDict):
        xdict, ydict = yDict, xDict
    else:
        xdict, ydict = xDict, yDict
    xIndexList = [xdict[i] for i in xIndex][::-1]
    yIndexList = [ydict[i] for i in yIndex][::-1]
    return (xIndexList, yIndexList)


def main():
    """Run the de-duplication on the reference file and plot the result."""
    # Imported here so the functions above can be imported without matplotlib.
    import time
    import matplotlib.pyplot as plt

    #Test data5
    data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv", delimiter=',')
    df1 = data.loc[data['uploadId'] == '2f61322480c841fd8679fe81e94930b2', ['utcTime', 'value']]
    df2 = data.loc[data['uploadId'] == 'c05970591b404518a1cbd64595d628e5', ['utcTime', 'value']]

    start_time = time.time()

    print("--------------vector one and two--------------------------")
    ts1 = np.array(df1['value'])
    ts2 = np.array(df2['value'])

    print("--------------original index for vector one and two--------------------------------------")
    ts1index = df1.index
    print("-----------------")
    ts2index = df2.index

    print("--------------------distance matrix---------------------")
    dis = Distances(ts1, ts2)

    print("----------------list of diagonals ---------------------------------------------")
    zeroarr, startindex = DiagonalList(dis)

    print("---------------diagonal index with zero---------------------------------------")
    diaindexes, diavals = DiagonalZero(dis, ts1, ts2, startindex)

    runs = zero_runs(diavals)

    print("---list of indexes---")

    print("----maximum count of zero-------------------------------------")
    zeroruns, sumindex = CountZero(runs)
    if sumindex is None:
        print("no duplicated values found")
        return
    maxruns = runs[sumindex]
    print("*******************")

    diazero = ZeroIndex(maxruns, diaindexes, sumindex)
    xindex, yindex, duplicatedts1, duplicatedts2 = DupIndex(ts1, ts2, diazero)
    print(duplicatedts1)
    print("-----------------------dup1------------------------------------------------------")
    print(duplicatedts2)

    print("-----------dictionary for original index--------------------------------------------------")
    originalindexts1 = lookupdict(ts1index)
    originalindexts2 = lookupdict(ts2index)

    print("--------------match to original index-------------------------")
    print(ExtracteIndex(originalindexts1, originalindexts2, xindex, yindex))

    print("%s seconds" % (time.time() - start_time))

    plt.subplot(2, 1, 1)
    plt.plot(duplicatedts1, 'r-')
    plt.ylabel('vector x')
    plt.subplot(2, 1, 2)
    plt.plot(duplicatedts2)
    plt.ylabel('vector y')

    plt.show()


if __name__ == "__main__":
    main()