From 659a938e16c0700dcfb64b92d841316f77eade96 Mon Sep 17 00:00:00 2001 From: Amenze Okpah Date: Sat, 14 Jul 2018 21:22:07 -0700 Subject: [PATCH 1/9] depulication function first Scenario --- deduplication.Rmd | 296 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 deduplication.Rmd diff --git a/deduplication.Rmd b/deduplication.Rmd new file mode 100644 index 00000000..c25706e1 --- /dev/null +++ b/deduplication.Rmd @@ -0,0 +1,296 @@ +--- +title: "uploadid" +author: "Amenze" +date: "June 28, 2018" +output: html_document +--- + +```{r} +setwd("/Users/Amenze/Desktop/tidepool/refdata") +``` + + +```{r } +#library(data.table) +library(ggplot2) +library(plyr) +library(dplyr) +#using jmotif +library(jmotif) +#install.packages("RecordLinkage") +library(RecordLinkage) +#install.packages("PTXQC") +library("PTXQC") +library(stringr) +library(zoo) + + + +``` + + + + +```{r} + +# create a list from these files +list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern=".csv$") + + +#extract files based on duplicated utctime +for (i in 1:length(list.filenames)) +{ + patient<-read.csv(list.filenames[i]) + patient_cbg<-subset(patient,patient$type=="cbg") + if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg)) + write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i])) + + } +``` + + + + + +```{r} +##read in file and check for duplicated utc + +field<-c("deviceId","id","uploadId","utcTime","type","value") +patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field] +patient$mgvalue<-patient$value*18.01559 +length(unique(patient$uploadId)) #1319 +length(unique(patient$id))==nrow(patient) ##True +length(unique(patient$utcTime))#72287 +length(unique(patient$utcTime))==nrow(patient) #False + + + +###subset patient file based on 5 uploadids +df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f") + +df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb") + + +df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b") + + +df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834") + + + +df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5") + +#plot values for each uploadid +par(mfrow=c(3,3)) + + +plot.ts(df1$value) +plot.ts(df2$value) +plot.ts(df3$value) +plot.ts(df4$value) +plot.ts(df5$value) + +``` + + + +```{r} +## input +#x:vector value for uploadid x +#y: vector value for uploadid y +#outputs +#longeststring:longest common substring match between discretize value of x and y respectively p1 & p2 +#percentage similarity +#alphaxy:the alphabet size + +stringcnvert<-function(x,y,alphaxy){ + if (length(x)!=length(y)){ + normvalue=x + } + else if(length(x)==length(y)){ + normvalue=x + } + normvalue.mean <- mean(normvalue) + normvalue.dev<-sd(normvalue) + xznorm<-(x - normvalue.mean)/normvalue.dev + yznorm<-(y - normvalue.mean)/normvalue.dev + y_paa1 = paa(xznorm,length(x)) ##we decided to use the exact length of the vector based on what we want to achieve + y_paa2 = paa(yznorm,length(y)) + xstringvalue<-series_to_string(y_paa1, alphaxy) + ystringvalue<-series_to_string(y_paa2, alphaxy) + p1<-xstringvalue + p2<-ystringvalue + longeststring<-LCSn(c(p1,p2)) + return (list(longeststring,p1,p2,(levenshteinSim(xstringvalue,ystringvalue)))) + + +} + + + + +##input +#values:output from stringcnvert function +##ouptputs +#stringvalue: 2 strings compared +#substringrep:matching substring that was compared, +#stringdetect:boolean eqivalent of string detected(TRUE:if detected, FALSE:not detected) +#stringcount:count of substrng in stringvalue +#loc:starting and ending indexes for substring in the two strings compared. + +duplicateindex<-function(values){ + stringsvalues <- as.character(c(values[2],values[3])) + substringrep <- as.character(values[1]) + stringdetect<-str_detect(stringsvalues,substringrep) + stringcount<-str_count(stringsvalues, substringrep) + loc <- str_locate(stringsvalues,substringrep ) + return(list(stringsvalues,substringrep,stringdetect,stringcount,loc)) +} + + + +##input +#x & y:dataframe +#strngindex: output from duplicateindex function(loc) +##ouptputs +#s1:numeric vector , +#s2:numeric vector, +#index1:indexes of s1, +#index2:indexes of s2 +# +extractindex<-function(x,y,strngindex){ + + s1<-x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],"value"] + s2<-y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],"value"] + indexs1<-row.names(x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],]) + indexs2<-row.names(y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],]) + + return(list(s1,s2,indexs1,indexs2)) + +} + + +###input +#x:a list of indexes +##ouptputs +#indexnum1:numeric equivalent of indexes + +extractvalue<-function(x){ + for (i in 1:length(x)[[1]]) + indexnum <- c(x[[i]]) + indexnum1<-as.numeric(indexnum) +return(indexnum1) +} + +##main +# +values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. +strngindex<-duplicateindex(values) +validate<-extractindex(df1,df2,strngindex) +seq1<-extractvalue(validate[3]) +seq2<-extractvalue(validate[4]) + +df1_mgvalue<-patient[seq1,"mgvalue"] +df2_mgvalue<-patient[seq2,"mgvalue"] +compare<-data.frame(df1_mgvalue,df2_mgvalue) + +# par(mfrow=c(1,2)) +# plot.ts(compare$df1_mgvalue) +# plot.ts(compare$df2_mgvalue) + +plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), + plot.type = "multiple", + col = c("red", "blue")) + + +sum(compare$df1_mgvalue-compare$df2_mgvalue) + + +``` + + + +```{r} +values<-stringcnvert(df3$mgvalue,df4$mgvalue,3) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run +strngindex<-duplicateindex(values) +validate<-extractindex(df3,df4,strngindex) +seq1<-extractvalue(validate[3]) +seq2<-extractvalue(validate[4]) + +df1_mgvalue<-patient[seq1,"mgvalue"] +df2_mgvalue<-patient[seq2,"mgvalue"] +compare<-data.frame(df1_mgvalue,df2_mgvalue) + +# par(mfrow=c(1,2)) +# plot.ts(compare$df1_mgvalue) +# plot.ts(compare$df2_mgvalue) + +plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), + plot.type = "multiple", + col = c("red", "blue")) + + +sum(compare$df1_mgvalue-compare$df2_mgvalue) +``` + + + + +```{r} +patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field] +patient$mgvalue<-patient$value*18.01559 +length(unique(patient$uploadId)) #2 +length(unique(patient$id))==nrow(patient) ##True +length(unique(patient$utcTime))# 8899 +anyDuplicated(patient$utcTime) #4366 +length(unique(patient$utcTime))==nrow(patient) #False + + + +df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2") + + +df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5") + + + +``` + + +```{r} +values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) +strngindex<-duplicateindex(values) +validate<-extractindex(df1,df2,strngindex) +seq1<-extractvalue(validate[3]) +seq2<-extractvalue(validate[4]) + +df1_mgvalue<-patient[seq1,"mgvalue"] +df2_mgvalue<-patient[seq2,"mgvalue"] +compare<-data.frame(df1_mgvalue,df2_mgvalue) + + + +plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), + plot.type = "multiple", + col = c("red", "blue")) + +sum(compare$df1_mgvalue-compare$df2_mgvalue) + + +``` + + + + + + + + + + + + + + + +#cross correlation From bb99788b0a10bc667df614ab99e5a5047788f75a Mon Sep 17 00:00:00 2001 From: Amenze Okpah Date: Tue, 17 Jul 2018 10:46:00 -0700 Subject: [PATCH 2/9] Cross correlation --- crosscorrelation.Rmd | 384 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 384 insertions(+) create mode 100644 crosscorrelation.Rmd diff --git a/crosscorrelation.Rmd b/crosscorrelation.Rmd new file mode 100644 index 00000000..c5d296a7 --- /dev/null +++ b/crosscorrelation.Rmd @@ -0,0 +1,384 @@ +--- +title: "uploadid" +author: "Amenze" +date: "June 28, 2018" +output: html_document +--- + +```{r} +setwd("/Users/Amenze/Desktop/tidepool/refdata") +``` + + +```{r } +#library(data.table) +library(ggplot2) +library(plyr) +library(dplyr) +#using jmotif +library(jmotif) +#install.packages("RecordLinkage") +library(RecordLinkage) +#install.packages("PTXQC") +library("PTXQC") +library(stringr) +library(zoo) + + + +``` + + + + +```{r} + +# create a list from these files +list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern=".csv$") + + +#extract files based on duplicated utctime +for (i in 1:length(list.filenames)) +{ + patient<-read.csv(list.filenames[i]) + patient_cbg<-subset(patient,patient$type=="cbg") + if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg)) + write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i])) + + } +``` + + + + + +```{r} +##read in file and check for duplicated utc + +field<-c("deviceId","id","uploadId","utcTime","type","value") +patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field] +patient$mgvalue<-patient$value*18.01559 +length(unique(patient$uploadId)) #1319 +length(unique(patient$id))==nrow(patient) ##True +length(unique(patient$utcTime))#72287 +length(unique(patient$utcTime))==nrow(patient) #False + + + +###subset patient file based on 5 uploadids +df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f") + +df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb") + + +df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b") + + +df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834") + + + +df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5") + +#plot values for each uploadid +par(mfrow=c(3,3)) + + +plot.ts(df1$value) +plot.ts(df2$value) +plot.ts(df3$value) +plot.ts(df4$value) +plot.ts(df5$value) + +``` + + + +```{r} +## input +#x:vector value for uploadid x +#y: vector value for uploadid y +#outputs +#longeststring:longest common substring match between discretize value of x and y respectively p1 & p2 +#percentage similarity +#alphaxy:the alphabet size + +stringcnvert<-function(x,y,alphaxy){ + if (length(x)!=length(y)){ + normvalue=x + } + else if(length(x)==length(y)){ + normvalue=x + } + normvalue.mean <- mean(normvalue) + normvalue.dev<-sd(normvalue) + xznorm<-(x - normvalue.mean)/normvalue.dev + yznorm<-(y - normvalue.mean)/normvalue.dev + y_paa1 = paa(xznorm,length(x)) ##we decided to use the exact length of the vector based on what we want to achieve + y_paa2 = paa(yznorm,length(y)) + xstringvalue<-series_to_string(y_paa1, alphaxy) + ystringvalue<-series_to_string(y_paa2, alphaxy) + p1<-xstringvalue + p2<-ystringvalue + longeststring<-LCSn(c(p1,p2)) + return (list(longeststring,p1,p2,(levenshteinSim(xstringvalue,ystringvalue)))) + + +} + + + + +##input +#values:output from stringcnvert function +##ouptputs +#stringvalue: 2 strings compared +#substringrep:matching substring that was compared, +#stringdetect:boolean eqivalent of string detected(TRUE:if detected, FALSE:not detected) +#stringcount:count of substrng in stringvalue +#loc:starting and ending indexes for substring in the two strings compared. + +duplicateindex<-function(values){ + stringsvalues <- as.character(c(values[2],values[3])) + substringrep <- as.character(values[1]) + stringdetect<-str_detect(stringsvalues,substringrep) + stringcount<-str_count(stringsvalues, substringrep) + loc <- str_locate(stringsvalues,substringrep ) + return(list(stringsvalues,substringrep,stringdetect,stringcount,loc)) +} + + + +##input +#x & y:dataframe +#strngindex: output from duplicateindex function(loc) +##ouptputs +#s1:numeric vector , +#s2:numeric vector, +#index1:indexes of s1, +#index2:indexes of s2 +# +extractindex<-function(x,y,strngindex){ + + s1<-x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],"value"] + s2<-y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],"value"] + indexs1<-row.names(x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],]) + indexs2<-row.names(y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],]) + + return(list(s1,s2,indexs1,indexs2)) + +} + + +###input +#x:a list of indexes +##ouptputs +#indexnum1:numeric equivalent of indexes + +extractvalue<-function(x){ + for (i in 1:length(x)[[1]]) + indexnum <- c(x[[i]]) + indexnum1<-as.numeric(indexnum) +return(indexnum1) +} + +##main +# +system.time(values<-stringcnvert(df1$mgvalue,df2$mgvalue,3))#http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. +#user system elapsed +# 0.11 0.00 0.11 + +system.time(strngindex<-duplicateindex(values)) +validate<-extractindex(df1,df2,strngindex) +seq1<-extractvalue(validate[3]) +seq2<-extractvalue(validate[4]) + +df1_mgvalue<-patient[seq1,"mgvalue"] +df2_mgvalue<-patient[seq2,"mgvalue"] +compare<-data.frame(df1_mgvalue,df2_mgvalue) + +# par(mfrow=c(1,2)) +# plot.ts(compare$df1_mgvalue) +# plot.ts(compare$df2_mgvalue) + +plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), + plot.type = "multiple", + col = c("red", "blue")) + + +sum(compare$df1_mgvalue-compare$df2_mgvalue) + + +``` + + + +```{r} +values<-stringcnvert(df3$mgvalue,df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run +strngindex<-duplicateindex(values) +validate<-extractindex(df3,df4,strngindex) +seq1<-extractvalue(validate[3]) +seq2<-extractvalue(validate[4]) + +df1_mgvalue<-patient[seq1,"mgvalue"] +df2_mgvalue<-patient[seq2,"mgvalue"] +compare<-data.frame(df1_mgvalue,df2_mgvalue) + +# par(mfrow=c(1,2)) +# plot.ts(compare$df1_mgvalue) +# plot.ts(compare$df2_mgvalue) + +plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), + plot.type = "multiple", + col = c("red", "blue")) + + +sum(compare$df1_mgvalue-compare$df2_mgvalue) +``` + + + + +```{r} +patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field] +patient$mgvalue<-patient$value*18.01559 +length(unique(patient$uploadId)) #2 +length(unique(patient$id))==nrow(patient) ##True +length(unique(patient$utcTime))# 8899 +anyDuplicated(patient$utcTime) #4366 +length(unique(patient$utcTime))==nrow(patient) #False + + + +df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2") + + +df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5") + + + +``` + + +```{r} +values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) +strngindex<-duplicateindex(values) +validate<-extractindex(df1,df2,strngindex) +seq1<-extractvalue(validate[3]) +seq2<-extractvalue(validate[4]) + +df1_mgvalue<-patient[seq1,"mgvalue"] +df2_mgvalue<-patient[seq2,"mgvalue"] +compare<-data.frame(df1_mgvalue,df2_mgvalue) + + + +plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), + plot.type = "multiple", + col = c("red", "blue")) + +sum(compare$df1_mgvalue-compare$df2_mgvalue) + + +``` + + + + + + + + + + + + + + + +#cross correlation +```{r} +corre<-function(x,y,lagsize){ + corr<-ccf(x$mgvalue,y$mgvalue,max.lag=lagsize,plot=TRUE) + max.value<-max(corr$acf) + max.lag<-corr$lag[which(corr$acf==max.value)] + xlength<-length(x$mgvalue) + ylength<-length(y$mgvalue) + if ((max.lag==0) & (xlength0)& (xlength0)&(ylength0) & (xlength==ylength)){ + startcomputelength<-max.lag + xvalue=x[max.lag:length(x),] + yvalue=y[max.lag:length(y),] + } + if (max.lag<0){ + startcomputelength<-abs(max.lag) + xendcomputelength<-xlength-startcomputelength + yendcomputelength<-ylength-startcomputelength + xvalue<-x[xendcomputelength:1,] + yvalue<-y[ylength:(startcomputelength+1),] + } + + return (list(df1=xvalue,df2=yvalue,max_correlation=max.value)) + +} + +system.time(corr<-corre(df1,df2,10)) +df1adj<-corr$df1 +df2adj<-corr$df2 + +#ccf(df1$mgvalue,df2$mgvalue) + + +##extracte the index + +t1indexvalue<-row.names(df1adj) +t2indexvalue<-row.names(df2adj) +ts1<-patient[t1indexvalue,"mgvalue"] +ts2<-patient[t2indexvalue,"mgvalue"] + +plot.zoo(cbind(ts1,ts2), + plot.type = "multiple", + col = c("red", "blue")) +sum(ts1-ts2) + + +``` + + +```{r} +corr<-corre(df3,df4,10) +df1adj<-corr$df1 +df2adj<-corr$df2 + + + +``` + + + From 360d1dcb68e2d5cfe2bb56b0e4fbcbb71ed9b4a4 Mon Sep 17 00:00:00 2001 From: Amenze Okpah Date: Fri, 10 Aug 2018 01:15:31 -0700 Subject: [PATCH 3/9] use distance matrix to find longest consecutive duplicate --- deduplication_distance.py | 339 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 339 insertions(+) create mode 100644 deduplication_distance.py diff --git a/deduplication_distance.py b/deduplication_distance.py new file mode 100644 index 00000000..b4369392 --- /dev/null +++ b/deduplication_distance.py @@ -0,0 +1,339 @@ +import numpy as np +import pandas as pd + +""" +Data Deduplication of Continous blood glucose. Final output are indexes of duplications, duplicated values and a plot of the values +""" +#Section one +""" +Test data comparing two uploadid's +files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv + 0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv +""" +##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',') +##df1=data.loc[data['alp'] =='x', ['value']] +##df2=data.loc[data['alp'] =='y', ['value']] + + +data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',') +##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']] +##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']] + + +##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']] +##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] + + +df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] +df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']] + + + +##data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',') +##df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']] +##df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']] + + + + +def distances(x,y): + """ + input: + x & y: vectors of cbg values + + output + distance matrix of x and y + """ + if len(y)>len(x): + leny=len(y) + lenx=len(x) + xval=x + yval=y + elif len(x)>len(y): + leny=len(x) + lenx=len(y) + xval=y + yval=x + elif len(y)==len(x): + lenx=len(x) + leny=len(y) + xval=x + yval=y + distances= [[0] * lenx for i in range(leny)] + for i in range(leny): + for j in range(lenx): + distances[i][j] = ((xval[j])-(yval[i]))**2 + return distances + + +def lstdiagonal(dis): + """ + input + dis: distance matrix + output:generate a list of diagonals + a:the diagonal array with max sum of zeroes. + w: start index + """ + matrix=np.array(dis) + j=-len(dis) + x=len(dis[0])+1 + longest_match=0 + for i in range(len(dis[0])-1,j,-1): + arr=matrix.diagonal(i) + nbr_zero=(arr == 0).sum() + if nbr_zero >= longest_match: + longest_match = nbr_zero + a=arr + w=abs(i) + return (a,w) + + + + + +def diagonalzero(df,ts1,ts2,startindex): + ''' + df: distance matrix + ts1:vector 1 + ts2:vector 2 + startindex: the start index for the diagonal + ouput: returns all values and index in diagonals with max zeroes + ''' + diaindex=[] + diaval=[] + if len(ts2)>len(ts1): + leny=len(ts2) + j=len(ts1)-1 + elif len(ts1)>len(ts2): + leny=len(ts1) + j=len(ts2)-1 + elif len(ts1)==len(ts2): + lenx=len(ts1) + leny=len(ts2) + j=len(ts1)-1 + i=startindex + k=0 + while i=0: + diaindex+=[[i,k]] + diaval+=[df[i][k]] + k=k+1 + i=i+1 + return (diaindex,diaval) + + + +def zero_runs(a): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array + """ + a: array output from diagonal zero + output: + Return the consecutive zero in the array + """ + iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0])) + absdiff = np.abs(np.diff(iszero)) + ranges = np.where(absdiff == 1)[0].reshape(-1, 2) + return ranges + + +def countzero(runs): + """ + runs: list of start and stop index of the consecutive zeros in an array + output:returns the start and stop index with max zeroes + ind: the index of the result within runs list + + """ + maxcount=0 + for i in range(len(runs)): + x=runs[i][1]-runs[i][0] + if x>=maxcount: + maxcount=x + ind=i + count=maxcount + return count,ind + + +def zeroindex(runs,dia,runindex): + ''' + dia:diagonal indexes returned from function diagonalzeros + runs:start and stop indexes + output"accumulate the indexes of consecutive zero + the index for the longest zeros" + ''' + i=runs[0] + j=runs[1] + indexlst=[] + for i in range(i,j): + indexlst+=[dia[i]] + + return indexlst + + + +def dupindex(x,y,diagonalzero): + """ + x & y : vectors + diagonalzero:Output from function zeroindex + output + line upindexes for vector x and y and the duplicate values + """ + yvalue=[] + xvalue=[] + xdup=[] + ydup=[] + if len(y)>len(x): + yval=y + xval=x + elif len(x)>len(y): + xval=y + yval=x + elif len(y)==len(x): + xval=x + yval=y + for i in range(len(diagonalzero)): + yvalue+=[diagonalzero[i][0]] + yindex=yvalue + for i in range(len(diagonalzero)): + xvalue+=[diagonalzero[i][1]] + xindex=xvalue + for i in range(len(xindex)): + val=xindex[i] + xdup+=[xval[val]] + ts1dup=xdup + ts1dup.reverse() + for i in range(len(yindex)): + val=yindex[i] + ydup+=[yval[val]] + ts2dup=ydup + ts2dup.reverse() + return(xindex,yindex,ts1dup,ts2dup) + +def lookupdict(x): + """ + x: vactor + output: a dictionary holding the original indexes of vector + """ + indexdict={} + for i in range(len(x)): + indexdict[i]=x[i] + return indexdict + + +##retrieve original index +def original(xval,yval,xind,yind): + """ + input: + xval & yval: dictionary from function lookupdict + xind $ yind: line up indexes + output: return a list of original duplicate indexes + """ + xlist=[] + ylist=[] + if len(yind) > len(xind): + y=yind + x=xind + elif len(xind)>len(xind): + y=xind + x=yind + xdict=x + elif len(xind)==len(yind): + y=yind + x=xind + if len(yval)>len(xval): + xdict=xval + ydict=yval + elif len(xval)> len(yval): + xdict=yval + ydict=xval + elif len(xval)==len(yval): + xdict=xval + ydict=yval + for i in xind: + xlist+=[xdict[i]] + for i in yind: + ylist+=[ydict[i]] + xlist.reverse() + ylist.reverse() + return (xlist,ylist) + + + +###main### +print "--------------vector one and two--------------------------" +ts1=np.array(df1['value']) +ts2=np.array(df2['value']) + +##print "--------------original index for vector one and two--------------------------------------" +ts1index= df1.index +print "-----------------" +print ts1index + +ts2index=df2.index +print ts2index +#### +print "--------------------distance matrix---------------------" +dis=distances(ts1,ts2) +###print dis +## +print "----------------list of diagonals ---------------------------------------------" +arr,index=lstdiagonal(dis) +startindex=index +zeroarr=arr +#print startindex +#print zeroarr + +print "---------------diagonal index with zero---------------------------------------" +diaindex,diaval= diagonalzero(dis,ts1,ts2,startindex) +diavals=diaval +diaindexes=diaindex +#print diavals +#print diaindexes + +runs=zero_runs(diavals) + +print "---list of indexes---" +#print runs +print "----maximum count of zero-------------------------------------" +sumzero,i=countzero(runs) +sumindex=i +maxruns=runs[sumindex] +#print maxruns +#print "***********" +#print sumindex + +zeroruns=sumzero +#print zeroruns +###### +diazero=zeroindex(maxruns,diaindexes,sumindex) +#### +#print diazero +xindex2,yindex2,ts1value,ts2value=dupindex(ts1,ts2,diazero) +###### +xindex=xindex2 +yindex=yindex2 +duplicatedts1=ts1value +duplicatedts2=ts2value +##print xindex +##print yindex +print duplicatedts1 +##print"-----------------------------------------------------------------------------" +print duplicatedts2 +#### +import matplotlib.pyplot as plt +plt.subplot(2, 1, 1) +plt.plot(duplicatedts1,'r-') +plt.ylabel('upid_3c41703c2d3a8b97f479afdb6ccf799f cbg') +plt.subplot(2, 1, 2) +plt.plot(duplicatedts2) +plt.ylabel('upid_3fc32e5ad912a8ea7efced9151804bdb cbg') + +plt.show() +print "-----------dictionary for original index--------------------------------------------------" +originalts1=lookupdict(ts1index) +originalts2=lookupdict(ts2index) +print originalts1 +print originalts2 + + +print "--------------match to original index-------------------------" +print original(originalts1,originalts2,xindex,yindex) + From 611c7e6f480057d83d810ae941a5eba619bd42a5 Mon Sep 17 00:00:00 2001 From: Amenzeo <34592606+Amenzeo@users.noreply.github.com> Date: Sun, 12 Aug 2018 02:56:23 -0700 Subject: [PATCH 4/9] Delete deduplication_distance.py --- deduplication_distance.py | 339 -------------------------------------- 1 file changed, 339 deletions(-) delete mode 100644 deduplication_distance.py diff --git a/deduplication_distance.py b/deduplication_distance.py deleted file mode 100644 index b4369392..00000000 --- a/deduplication_distance.py +++ /dev/null @@ -1,339 +0,0 @@ -import numpy as np -import pandas as pd - -""" -Data Deduplication of Continous blood glucose. Final output are indexes of duplications, duplicated values and a plot of the values -""" -#Section one -""" -Test data comparing two uploadid's -files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv - 0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv -""" -##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',') -##df1=data.loc[data['alp'] =='x', ['value']] -##df2=data.loc[data['alp'] =='y', ['value']] - - -data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',') -##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']] -##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']] - - -##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']] -##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] - - -df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] -df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']] - - - -##data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',') -##df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']] -##df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']] - - - - -def distances(x,y): - """ - input: - x & y: vectors of cbg values - - output - distance matrix of x and y - """ - if len(y)>len(x): - leny=len(y) - lenx=len(x) - xval=x - yval=y - elif len(x)>len(y): - leny=len(x) - lenx=len(y) - xval=y - yval=x - elif len(y)==len(x): - lenx=len(x) - leny=len(y) - xval=x - yval=y - distances= [[0] * lenx for i in range(leny)] - for i in range(leny): - for j in range(lenx): - distances[i][j] = ((xval[j])-(yval[i]))**2 - return distances - - -def lstdiagonal(dis): - """ - input - dis: distance matrix - output:generate a list of diagonals - a:the diagonal array with max sum of zeroes. - w: start index - """ - matrix=np.array(dis) - j=-len(dis) - x=len(dis[0])+1 - longest_match=0 - for i in range(len(dis[0])-1,j,-1): - arr=matrix.diagonal(i) - nbr_zero=(arr == 0).sum() - if nbr_zero >= longest_match: - longest_match = nbr_zero - a=arr - w=abs(i) - return (a,w) - - - - - -def diagonalzero(df,ts1,ts2,startindex): - ''' - df: distance matrix - ts1:vector 1 - ts2:vector 2 - startindex: the start index for the diagonal - ouput: returns all values and index in diagonals with max zeroes - ''' - diaindex=[] - diaval=[] - if len(ts2)>len(ts1): - leny=len(ts2) - j=len(ts1)-1 - elif len(ts1)>len(ts2): - leny=len(ts1) - j=len(ts2)-1 - elif len(ts1)==len(ts2): - lenx=len(ts1) - leny=len(ts2) - j=len(ts1)-1 - i=startindex - k=0 - while i=0: - diaindex+=[[i,k]] - diaval+=[df[i][k]] - k=k+1 - i=i+1 - return (diaindex,diaval) - - - -def zero_runs(a): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array - """ - a: array output from diagonal zero - output: - Return the consecutive zero in the array - """ - iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0])) - absdiff = np.abs(np.diff(iszero)) - ranges = np.where(absdiff == 1)[0].reshape(-1, 2) - return ranges - - -def countzero(runs): - """ - runs: list of start and stop index of the consecutive zeros in an array - output:returns the start and stop index with max zeroes - ind: the index of the result within runs list - - """ - maxcount=0 - for i in range(len(runs)): - x=runs[i][1]-runs[i][0] - if x>=maxcount: - maxcount=x - ind=i - count=maxcount - return count,ind - - -def zeroindex(runs,dia,runindex): - ''' - dia:diagonal indexes returned from function diagonalzeros - runs:start and stop indexes - output"accumulate the indexes of consecutive zero - the index for the longest zeros" - ''' - i=runs[0] - j=runs[1] - indexlst=[] - for i in range(i,j): - indexlst+=[dia[i]] - - return indexlst - - - -def dupindex(x,y,diagonalzero): - """ - x & y : vectors - diagonalzero:Output from function zeroindex - output - line upindexes for vector x and y and the duplicate values - """ - yvalue=[] - xvalue=[] - xdup=[] - ydup=[] - if len(y)>len(x): - yval=y - xval=x - elif len(x)>len(y): - xval=y - yval=x - elif len(y)==len(x): - xval=x - yval=y - for i in range(len(diagonalzero)): - yvalue+=[diagonalzero[i][0]] - yindex=yvalue - for i in range(len(diagonalzero)): - xvalue+=[diagonalzero[i][1]] - xindex=xvalue - for i in range(len(xindex)): - val=xindex[i] - xdup+=[xval[val]] - ts1dup=xdup - ts1dup.reverse() - for i in range(len(yindex)): - val=yindex[i] - ydup+=[yval[val]] - ts2dup=ydup - ts2dup.reverse() - return(xindex,yindex,ts1dup,ts2dup) - -def lookupdict(x): - """ - x: vactor - output: a dictionary holding the original indexes of vector - """ - indexdict={} - for i in range(len(x)): - indexdict[i]=x[i] - return indexdict - - -##retrieve original index -def original(xval,yval,xind,yind): - """ - input: - xval & yval: dictionary from function lookupdict - xind $ yind: line up indexes - output: return a list of original duplicate indexes - """ - xlist=[] - ylist=[] - if len(yind) > len(xind): - y=yind - x=xind - elif len(xind)>len(xind): - y=xind - x=yind - xdict=x - elif len(xind)==len(yind): - y=yind - x=xind - if len(yval)>len(xval): - xdict=xval - ydict=yval - elif len(xval)> len(yval): - xdict=yval - ydict=xval - elif len(xval)==len(yval): - xdict=xval - ydict=yval - for i in xind: - xlist+=[xdict[i]] - for i in yind: - ylist+=[ydict[i]] - xlist.reverse() - ylist.reverse() - return (xlist,ylist) - - - -###main### -print "--------------vector one and two--------------------------" -ts1=np.array(df1['value']) -ts2=np.array(df2['value']) - -##print "--------------original index for vector one and two--------------------------------------" -ts1index= df1.index -print "-----------------" -print ts1index - -ts2index=df2.index -print ts2index -#### -print "--------------------distance matrix---------------------" -dis=distances(ts1,ts2) -###print dis -## -print "----------------list of diagonals ---------------------------------------------" -arr,index=lstdiagonal(dis) -startindex=index -zeroarr=arr -#print startindex -#print zeroarr - -print "---------------diagonal index with zero---------------------------------------" -diaindex,diaval= diagonalzero(dis,ts1,ts2,startindex) -diavals=diaval -diaindexes=diaindex -#print diavals -#print diaindexes - -runs=zero_runs(diavals) - -print "---list of indexes---" -#print runs -print "----maximum count of zero-------------------------------------" -sumzero,i=countzero(runs) -sumindex=i -maxruns=runs[sumindex] -#print maxruns -#print "***********" -#print sumindex - -zeroruns=sumzero -#print zeroruns -###### -diazero=zeroindex(maxruns,diaindexes,sumindex) -#### -#print diazero -xindex2,yindex2,ts1value,ts2value=dupindex(ts1,ts2,diazero) -###### -xindex=xindex2 -yindex=yindex2 -duplicatedts1=ts1value -duplicatedts2=ts2value -##print xindex -##print yindex -print duplicatedts1 -##print"-----------------------------------------------------------------------------" -print duplicatedts2 -#### -import matplotlib.pyplot as plt -plt.subplot(2, 1, 1) -plt.plot(duplicatedts1,'r-') -plt.ylabel('upid_3c41703c2d3a8b97f479afdb6ccf799f cbg') -plt.subplot(2, 1, 2) -plt.plot(duplicatedts2) -plt.ylabel('upid_3fc32e5ad912a8ea7efced9151804bdb cbg') - -plt.show() -print "-----------dictionary for original index--------------------------------------------------" -originalts1=lookupdict(ts1index) -originalts2=lookupdict(ts2index) -print originalts1 -print originalts2 - - -print "--------------match to original index-------------------------" -print original(originalts1,originalts2,xindex,yindex) - From 8fccbbbbb807a720e8f1d951fef4ae6e445325c5 Mon Sep 17 00:00:00 2001 From: Amenze Okpah Date: Sun, 12 Aug 2018 03:06:59 -0700 Subject: [PATCH 5/9] Create functions to find duplicated values between two vectors consecutive zero values through the diagonal use to obtain matching indexes. --- deduplication_distance.py | 356 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 deduplication_distance.py diff --git a/deduplication_distance.py b/deduplication_distance.py new file mode 100644 index 00000000..7a36820b --- /dev/null +++ b/deduplication_distance.py @@ -0,0 +1,356 @@ +import numpy as np +import pandas as pd + +""" +Data Deduplication of Continous blood glucose. Final output are indexes of duplicated values, duplicated values and a plot of the values +""" + +""" +Test data from two files +files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv + 0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv +""" + +#Test data1 +##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',') +##print data +##df1=data.loc[data['alp'] =='x', ['value']] +##df2=data.loc[data['alp'] =='y', ['value']] + +#Test data2 +#data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',') +##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']] +##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']] + + +#Test data3 +##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']] +##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] + + +#Test data4 +##df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] +##df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']] + + +#Test data5 +data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',') +df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']] +df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']] + + + + +def Distances(x,y): + """ + Compute the distance matrix for vector x and y + Args: + x:vector value for uploadid x + y:vector value for uploadid y + Returns: + distances: distance matrix of x and y + """ + if len(y) > len(x): + leny = len(y) + lenx = len(x) + xval = x + yval = y + elif len(x) > len(y): + leny = len(x) + lenx = len(y) + xval = y + yval = x + elif len(y) == len(x): + lenx = len(x) + leny = len(y) + xval = x + yval = y + distances= [[0] * lenx for i in range(leny)] + for i in range(leny): + for j in range(lenx): + distances[i][j] = ((xval[j])-(yval[i]))**2 + return distances + + +def DiagonalList(dis): + """ + Find the diagonal with the highest count of zero and the diagonal start index + Args: + dis: distance matrix + Returns: + diagonal: diagonal with higest count of zero + diagonalStartIndex:Start Index of diagonal + """ + matrix=np.array(dis) + j=-len(dis) + x=len(dis[0])+1 + highestCount=0 + for i in range(len(dis[0])-1,j,-1): + arr = matrix.diagonal(i) + countZero = (arr == 0).sum() + if countZero >= highestCount: + highestCount = countZero + diagonal=arr + diagonalStartIndex=abs(i) + return (diagonal,diagonalStartIndex) + + +def DiagonalZero(disMatrix,ts1,ts2,startindex): + """ + Compute the diagonal Index with the highest count of zero (output from DiagonalList) + Args: + disMatrix: distance matrix + ts1:Vector 1 + ts2:vector 2 + startindex: the start index for the diagonal + Returns: + dia.Index:diagonal Index + dia.value: diagonal value + """ + diaIndex=[] + diaValue=[] + if len(ts2) > len(ts1): + leny = len(ts2) + j = len(ts1)-1 + elif len(ts1) > len(ts2): + leny = len(ts1) + j = len(ts2)-1 + elif len(ts1) == len(ts2): + lenx = len(ts1) + leny = len(ts2) + j = len(ts1)-1 + i = startindex + k = 0 + while i < leny and k <= j: + if disMatrix[i][k] >= 0: + diaIndex += [[i,k]] + diaValue += [disMatrix[i][k]] + k = k+1 + i = i+1 + return (diaIndex,diaValue) + + + +def zero_runs(diaValue): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array + """ + Args: + diaValue: diagonal values returned from function diagonalzero + Returns: + ranges: list of consecutive zero ranges in the diagonal + """ + iszero = np.concatenate(([0], np.equal(diaValue, 0).view(np.int8), [0])) + absdiff = np.abs(np.diff(iszero)) + ranges = np.where(absdiff == 1)[0].reshape(-1, 2) + return ranges + + +def CountZero(runs): + """ + Args: + runs: list of start and stop index of the consecutive zeros in an array + Returns: + totalCount:returns the count of consecutive zero + countIndex:list index with max zeros + + """ + maxcount=0 + for i in range(len(runs)): + count=runs[i][1]-runs[i][0] + if count>=maxcount: + maxcount=count + countIndex=i + totalCount=maxcount + return (totalCount,countIndex) + + +def ZeroIndex(runs,dia,runindex): + """ + #accumulate indexes + Args: + dia:diagonal indexes returned from function DiagonalZero + runs:start and stop indexes + Results: + indexlst: list of indexes + """ + i=runs[0] + j=runs[1] + indexlst=[] + for i in range(i,j): + indexlst+=[dia[i]] + + return indexlst + + + +def DupIndex(x,y,indexzero): + """ + Args: + x & y : vectors + indexzero:Output from function zeroindex + Result: + xindex& yindex:matrix indexes for vector x and y + ts1dup &ts2dup: duplicated values + """ + yvalue=[] + xvalue=[] + xdup=[] + ydup=[] + if len(y)>len(x): + yval=y + xval=x + elif len(x)>len(y): + xval=y + yval=x + elif len(y)==len(x): + xval=x + yval=y + for i in range(len(indexzero)): + yvalue+=[indexzero[i][0]] + yindex=yvalue + for i in range(len(indexzero)): + xvalue+=[indexzero[i][1]] + xindex=xvalue + for i in range(len(xindex)): + val=xindex[i] + xdup+=[xval[val]] + ts1dup=xdup + ts1dup.reverse() + for i in range(len(yindex)): + val=yindex[i] + ydup+=[yval[val]] + ts2dup=ydup + ts2dup.reverse() + return(xindex,yindex,ts1dup,ts2dup) + +def lookupdict(x): + """ + Args: + x: vector + Returns: + indexDict: a dictionary holding the original indexes of vector + """ + indexDict={} + for i in range(len(x)): + indexDict[i]=x[i] + return indexDict + +def ExtracteIndex(xDict,yDict,xIndex,yIndex): + """ + Args: + xDict & yDict: dictionary from function lookupdict + xIndex $ yIndex: line up indexes + Returns: + xIndexList,yIndexList: a list of original duplicate indexes + """ + xIndexList=[] + yIndexList=[] + if len(yIndex) > len(xIndex): + y=yIndex + x=xIndex + elif len(xIndex)>len(yIndex): + y=xIndex + x=yIndex + # x.Dict=x + elif len(xIndex)==len(yIndex): + y=yIndex + x=xIndex + if len(yDict)>len(xDict): + xdict=xDict + ydict=yDict + elif len(xDict)> len(yDict): + xdict=yDict + ydict=xDict + elif len(xDict)==len(yDict): + xdict=xDict + ydict=yDict + for i in xIndex: + xIndexList+=[xdict[i]] + for i in yIndex: + yIndexList+=[ydict[i]] + xIndexList.reverse() + yIndexList.reverse() + return (xIndexList,yIndexList) + + + +###main### +print "--------------vector one and two--------------------------" +ts1=np.array(df1['value']) +ts2=np.array(df2['value']) + +print "--------------original index for vector one and two--------------------------------------" +ts1index= df1.index +print "-----------------" +#print ts1index + +ts2index=df2.index +#print ts2index +#### +print "--------------------distance matrix---------------------" +dis=Distances(ts1,ts2) +#print dis +## +print "----------------list of diagonals ---------------------------------------------" +arr,index=DiagonalList(dis) +startindex=index +zeroarr=arr +#print startindex +#print zeroarr + +print "---------------diagonal index with zero---------------------------------------" +diaindex,diaval= DiagonalZero(dis,ts1,ts2,startindex) +diavals=diaval +diaindexes=diaindex +#print diavals +#print diaindexes + +runs=zero_runs(diavals) + +print "---list of indexes---" +#print runs + +print "----maximum count of zero-------------------------------------" +sumzero,i=CountZero(runs) +sumindex=i +#print sumindex +maxruns=runs[sumindex] +#print maxruns +zeroruns=sumzero +#print zeroruns +print "*******************" +###### +diazero=ZeroIndex(maxruns,diaindexes,sumindex) +#### +#print diazero +xindex2,yindex2,ts1value,ts2value=DupIndex(ts1,ts2,diazero) +###### +xindex=xindex2 +yindex=yindex2 +duplicatedts1=ts1value +duplicatedts2=ts2value +#print xindex +#print yindex +print duplicatedts1 +print"-----------------------dup1------------------------------------------------------" +print duplicatedts2 + +print "-----------dictionary for original index--------------------------------------------------" +originalindexts1=lookupdict(ts1index) +originalindexts2=lookupdict(ts2index) +#print originalindexts1 +#print originalindexts2 + + +print "--------------match to original index-------------------------" +print ExtracteIndex(originalindexts1,originalindexts2,xindex,yindex) + +## +import matplotlib.pyplot as plt +plt.subplot(2, 1, 1) +plt.plot(duplicatedts1,'r-') +plt.ylabel('vector x') +plt.subplot(2, 1, 2) +plt.plot(duplicatedts2) +plt.ylabel('vector y') + +plt.show() From 62faa5571f3e63ec4caeaf2059f19a30aa23860d Mon Sep 17 00:00:00 2001 From: Amenzeo <34592606+Amenzeo@users.noreply.github.com> Date: Sun, 12 Aug 2018 03:43:06 -0700 Subject: [PATCH 6/9] Update deduplication_distance.py --- deduplication_distance.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/deduplication_distance.py b/deduplication_distance.py index 7a36820b..f2cb8154 100644 --- a/deduplication_distance.py +++ b/deduplication_distance.py @@ -1,46 +1,34 @@ import numpy as np import pandas as pd - """ Data Deduplication of Continous blood glucose. Final output are indexes of duplicated values, duplicated values and a plot of the values """ - """ Test data from two files files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv 0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv """ - #Test data1 ##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',') ##print data ##df1=data.loc[data['alp'] =='x', ['value']] ##df2=data.loc[data['alp'] =='y', ['value']] - #Test data2 #data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',') ##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']] ##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']] - - #Test data3 ##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']] ##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] - - #Test data4 ##df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']] ##df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']] - #Test data5 data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',') df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']] df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']] - - - def Distances(x,y): """ Compute the distance matrix for vector x and y @@ -71,7 +59,6 @@ def Distances(x,y): distances[i][j] = ((xval[j])-(yval[i]))**2 return distances - def DiagonalList(dis): """ Find the diagonal with the highest count of zero and the diagonal start index @@ -94,7 +81,6 @@ def DiagonalList(dis): diagonalStartIndex=abs(i) return (diagonal,diagonalStartIndex) - def DiagonalZero(disMatrix,ts1,ts2,startindex): """ Compute the diagonal Index with the highest count of zero (output from DiagonalList) @@ -129,8 +115,6 @@ def DiagonalZero(disMatrix,ts1,ts2,startindex): i = i+1 return (diaIndex,diaValue) - - def zero_runs(diaValue): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array """ Args: @@ -142,8 +126,6 @@ def zero_runs(diaValue): #https://stackoverflow.com/questions/24885092/finding-t absdiff = np.abs(np.diff(iszero)) ranges = np.where(absdiff == 1)[0].reshape(-1, 2) return ranges - - def CountZero(runs): """ Args: @@ -151,7 +133,6 @@ def CountZero(runs): Returns: totalCount:returns the count of consecutive zero countIndex:list index with max zeros - """ maxcount=0 for i in range(len(runs)): @@ -180,8 +161,6 @@ def ZeroIndex(runs,dia,runindex): return indexlst - - def DupIndex(x,y,indexzero): """ Args: @@ -272,7 +251,6 @@ def ExtracteIndex(xDict,yDict,xIndex,yIndex): return (xIndexList,yIndexList) - ###main### print "--------------vector one and two--------------------------" ts1=np.array(df1['value']) From ce70580d21d9ba7b9bd2421b83bd945358e5bd32 Mon Sep 17 00:00:00 2001 From: Amenzeo <34592606+Amenzeo@users.noreply.github.com> Date: Tue, 14 Aug 2018 11:10:56 -0700 Subject: [PATCH 7/9] Update deduplication_distance.py --- deduplication_distance.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deduplication_distance.py b/deduplication_distance.py index f2cb8154..40ddffd3 100644 --- a/deduplication_distance.py +++ b/deduplication_distance.py @@ -249,8 +249,9 @@ def ExtracteIndex(xDict,yDict,xIndex,yIndex): xIndexList.reverse() yIndexList.reverse() return (xIndexList,yIndexList) +import time +start_time = time.time() - ###main### print "--------------vector one and two--------------------------" ts1=np.array(df1['value']) @@ -322,6 +323,8 @@ def ExtracteIndex(xDict,yDict,xIndex,yIndex): print "--------------match to original index-------------------------" print ExtracteIndex(originalindexts1,originalindexts2,xindex,yindex) +print time.time() - start_time, "seconds" + ## import matplotlib.pyplot as plt plt.subplot(2, 1, 1) From 73bf8256ffc06ba01c5fae125e425f4587fc3ee1 Mon Sep 17 00:00:00 2001 From: Amenze Okpah Date: Wed, 15 Aug 2018 14:05:55 -0700 Subject: [PATCH 8/9] Documentation and test run --- deduplication.Rmd | 386 +++++++++++++++++----------------------------- 1 file changed, 143 insertions(+), 243 deletions(-) diff --git a/deduplication.Rmd b/deduplication.Rmd index c25706e1..159c9729 100644 --- a/deduplication.Rmd +++ b/deduplication.Rmd @@ -1,89 +1,129 @@ --- -title: "uploadid" -author: "Amenze" +title: "De-duplication" +author: "Amenze Okpah" date: "June 28, 2018" output: html_document --- -```{r} -setwd("/Users/Amenze/Desktop/tidepool/refdata") -``` - - -```{r } -#library(data.table) -library(ggplot2) -library(plyr) -library(dplyr) -#using jmotif -library(jmotif) -#install.packages("RecordLinkage") -library(RecordLinkage) -#install.packages("PTXQC") -library("PTXQC") -library(stringr) -library(zoo) - - - -``` - - - +compare similarity between two numeric vectors using Symbolic Aggregate approXimation (SAX). +Symbolic Aggregate approXimation (SAX) algorithm application to the input time series transforms its into a strings. +The algoithm was proposed by Lin et al.) and extends the PAA-based approach inheriting the original algorithm simplicity and low computational complexity while providing satisfactory sensitivity and selectivity in range query processing. Moreover, the use of a symbolic representation opened a door to the existing wealth of data-structures and string-manipulation algorithms in computer science such as hashing, regular expression, pattern matching, suffix trees, and grammatical inference. +#http://www.cs.ucr.edu/~eamonn/SAX.htm +#https://jmotif.github.io/sax-vsm_site/morea/algorithm/PAA.html ```{r} - # create a list from these files -list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern=".csv$") - - +dir <- choose.dir(default = "", caption = "Select folder") +list.filenames<-list.files(dir,pattern=".csv$") #extract files based on duplicated utctime for (i in 1:length(list.filenames)) { patient<-read.csv(list.filenames[i]) patient_cbg<-subset(patient,patient$type=="cbg") if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg)) - write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i])) - + write.csv(patient_cbg,paste0(dir,list.filenames[i])) } ``` - - - - - + # Coverts two vectors to their string equivalent + # Args: + # x:vector of numeric values for uploadid1 + # y:vector of numeric values for uploadid2 + # alpha.Size: alphabet size(number of character that will be used to discretized numeric values) + # Returns: + # longeststring:longest common substring match between discretized valuea of x and y respectively stringX,stringY + # percentage similarity between vector x and y + +#Note: since PAA takes normalized values,vectors are normalized using the mean and standard deviation of either vector (x or y) +#PAA:the length of PAA values are fixed to length of the vectors to avoid reducing the dimensions since all values are needed to check for duplication. ```{r} -##read in file and check for duplicated utc - -field<-c("deviceId","id","uploadId","utcTime","type","value") -patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field] -patient$mgvalue<-patient$value*18.01559 -length(unique(patient$uploadId)) #1319 -length(unique(patient$id))==nrow(patient) ##True -length(unique(patient$utcTime))#72287 -length(unique(patient$utcTime))==nrow(patient) #False - - +StringConvert <- function(x, y, alpha.Size){ + if (length(x) != length(y)){ + normvalue = x + } + else if(length(x) == length(y)){ + normvalue = x + } + normvalue.Mean <- mean(normvalue) + normvalue.Dev <- sd(normvalue) + xnormalized <- (x - normvalue.Mean) / normvalue.Dev + ynormalized <- (y - normvalue.Mean) / normvalue.Dev + X.PAA = paa(xnormalized, length(x)) + y.PAA = paa(ynormalized, length(y)) + xString.Value <- series_to_string(X.PAA, alpha.Size) + yString.Value <- series_to_string(y.PAA, alpha.Size) + stringX <- xString.Value + stringY <- yString.Value + longeststring<-LCSn(c(stringX,stringY)) + return (list(longeststring, stringX, stringY, (levenshteinSim(xString.Value, yString.Value)))) +} +``` +#computes the range of matching character between string values and common subsequence + Args: + values:output from stringcnvert function + Returns: + Loc:Matching Ranges +```{r} +IndexRange<-function(values){ + strings.Values <- as.character(c(values[2],values[3])) + common.SubSequence <- as.character(values[1]) + loc <- str_locate(strings.Values, common.SubSequence) + return(list(loc)) +} +``` +##computes the duplicated values for each vectors and their indexes + # Args: + # x & y: vectors + # Returns: + # duplicated.X:duplicated value for vextor x + # duplicated.Y:duplicated value for vextor y + # index.X:duplicated values index for vector x + # index.Y:duplicated values index for vector y +```{r} +ExtractDuplicateIndex <- function(x, y, IndexRange){ + duplicate.X <- x[IndexRange[[1]][1,][1] : IndexRange[[1]][1,][2], "value"] + duplicate.Y <- y[IndexRange[[1]][2,][1] : IndexRange[[1]][2,][2], "value"] + index.X <- row.names(x[IndexRange[[1]][1,][1] : IndexRange[[1]][1,][2],]) + index.Y <- row.names(y[IndexRange[[1]][2,][1] : IndexRange[[1]][2,][2],]) +return(list(duplicate.X, duplicate.Y, index.X, index.Y)) +} +``` +##computes a list of indexes + Args: + x: vectors + Returns: + duplicate.Index:duplicated value index +```{r} +ExtractVectorIndex<-function(x){ + for (i in 1:length(x)) + indexnum <- c(x[[i]]) + duplicate.Index<-as.numeric(indexnum) +return(duplicate.Index) +} +``` +#import files and subset data based on uploadIds to test function +```{r} +##read in file and check for duplicated utcTime +field <- c("deviceId", "id", "uploadId", "utcTime", "type", "value") +patient <- read.csv("FileName ")[,field] #0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv +patient$mgvalue <- patient$value*18.01559 + +#check for unique uploadIds +uniqueid <- function(df){ + for (i in df["uploadId"]){ + Id <- unique(df$utcTime) + return (Id) + } +} ###subset patient file based on 5 uploadids -df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f") - -df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb") - - -df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b") - - -df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834") - - - -df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5") +df1 <- subset(patient, patient$uploadId == "upid_3c41703c2d3a8b97f479afdb6ccf799f") +df2 <- subset(patient, patient$uploadId == "upid_3fc32e5ad912a8ea7efced9151804bdb") +df3 <- subset(patient, patient$uploadId == "upid_17db2d2a0ae0e02a12c0a5067e5fe85b") +df4 <- subset(patient, patient$uploadId == "upid_5fad608cf32bd03a1cd56e3bb1fdb834") +df5 <- subset(patient, patient$uploadId == "upid_830c6de3e2ecbbec6fbad0cecc64bdf5") #plot values for each uploadid -par(mfrow=c(3,3)) - - +par(mfrow=c(3, 3)) plot.ts(df1$value) plot.ts(df2$value) plot.ts(df3$value) @@ -92,205 +132,65 @@ plot.ts(df5$value) ``` - - ```{r} -## input -#x:vector value for uploadid x -#y: vector value for uploadid y -#outputs -#longeststring:longest common substring match between discretize value of x and y respectively p1 & p2 -#percentage similarity -#alphaxy:the alphabet size - -stringcnvert<-function(x,y,alphaxy){ - if (length(x)!=length(y)){ - normvalue=x - } - else if(length(x)==length(y)){ - normvalue=x - } - normvalue.mean <- mean(normvalue) - normvalue.dev<-sd(normvalue) - xznorm<-(x - normvalue.mean)/normvalue.dev - yznorm<-(y - normvalue.mean)/normvalue.dev - y_paa1 = paa(xznorm,length(x)) ##we decided to use the exact length of the vector based on what we want to achieve - y_paa2 = paa(yznorm,length(y)) - xstringvalue<-series_to_string(y_paa1, alphaxy) - ystringvalue<-series_to_string(y_paa2, alphaxy) - p1<-xstringvalue - p2<-ystringvalue - longeststring<-LCSn(c(p1,p2)) - return (list(longeststring,p1,p2,(levenshteinSim(xstringvalue,ystringvalue)))) - - -} - - - - -##input -#values:output from stringcnvert function -##ouptputs -#stringvalue: 2 strings compared -#substringrep:matching substring that was compared, -#stringdetect:boolean eqivalent of string detected(TRUE:if detected, FALSE:not detected) -#stringcount:count of substrng in stringvalue -#loc:starting and ending indexes for substring in the two strings compared. - -duplicateindex<-function(values){ - stringsvalues <- as.character(c(values[2],values[3])) - substringrep <- as.character(values[1]) - stringdetect<-str_detect(stringsvalues,substringrep) - stringcount<-str_count(stringsvalues, substringrep) - loc <- str_locate(stringsvalues,substringrep ) - return(list(stringsvalues,substringrep,stringdetect,stringcount,loc)) -} - - - -##input -#x & y:dataframe -#strngindex: output from duplicateindex function(loc) -##ouptputs -#s1:numeric vector , -#s2:numeric vector, -#index1:indexes of s1, -#index2:indexes of s2 -# -extractindex<-function(x,y,strngindex){ - - s1<-x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],"value"] - s2<-y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],"value"] - indexs1<-row.names(x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],]) - indexs2<-row.names(y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],]) - - return(list(s1,s2,indexs1,indexs2)) - -} - - -###input -#x:a list of indexes -##ouptputs -#indexnum1:numeric equivalent of indexes - -extractvalue<-function(x){ - for (i in 1:length(x)[[1]]) - indexnum <- c(x[[i]]) - indexnum1<-as.numeric(indexnum) -return(indexnum1) -} - -##main -# -values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. -strngindex<-duplicateindex(values) -validate<-extractindex(df1,df2,strngindex) -seq1<-extractvalue(validate[3]) -seq2<-extractvalue(validate[4]) - -df1_mgvalue<-patient[seq1,"mgvalue"] -df2_mgvalue<-patient[seq2,"mgvalue"] -compare<-data.frame(df1_mgvalue,df2_mgvalue) - -# par(mfrow=c(1,2)) -# plot.ts(compare$df1_mgvalue) -# plot.ts(compare$df2_mgvalue) - -plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), +system.time(values <- StringConvert(df1$mgvalue, df2$mgvalue,3)) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. +system.time(strngindex <- IndexRange(values)) +validate <- ExtractDuplicateIndex(df1, df2, strngindex) +seq1 <- ExtractVectorIndex(validate[3]) +seq2 <- ExtractVectorIndex(validate[4]) +df1_mgvalue <- patient[seq1,"value"] +df2_mgvalue <- patient[seq2,"value"] +compare <- data.frame(df1_mgvalue,df2_mgvalue) + +#plot duplicated values +plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue), plot.type = "multiple", col = c("red", "blue")) - - -sum(compare$df1_mgvalue-compare$df2_mgvalue) - - +sum(compare$df1_mgvalue - compare$df2_mgvalue) ``` - - ```{r} -values<-stringcnvert(df3$mgvalue,df4$mgvalue,3) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run -strngindex<-duplicateindex(values) -validate<-extractindex(df3,df4,strngindex) -seq1<-extractvalue(validate[3]) -seq2<-extractvalue(validate[4]) - -df1_mgvalue<-patient[seq1,"mgvalue"] -df2_mgvalue<-patient[seq2,"mgvalue"] -compare<-data.frame(df1_mgvalue,df2_mgvalue) - -# par(mfrow=c(1,2)) -# plot.ts(compare$df1_mgvalue) -# plot.ts(compare$df2_mgvalue) - -plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), +values <- StringConvert(df3$mgvalue, df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run +strngindex <- IndexRange(values) +validate <- ExtractDuplicateIndex(df3,df4,strngindex) +seq1 <- ExtractVectorIndex(validate[3]) +seq2 <- ExtractVectorIndex(validate[4]) + +df1_mgvalue <- patient[seq1,"value"] +df2_mgvalue <- patient[seq2,"value"] +compare <- data.frame(df1_mgvalue, df2_mgvalue) +plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue), plot.type = "multiple", col = c("red", "blue")) - - -sum(compare$df1_mgvalue-compare$df2_mgvalue) +sum(compare$df1_mgvalue - compare$df2_mgvalue) ``` - - - ```{r} -patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field] -patient$mgvalue<-patient$value*18.01559 -length(unique(patient$uploadId)) #2 -length(unique(patient$id))==nrow(patient) ##True -length(unique(patient$utcTime))# 8899 -anyDuplicated(patient$utcTime) #4366 -length(unique(patient$utcTime))==nrow(patient) #False - - - +patient <- read.csv("filename")[,field] #0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv +patient$mgvalue <- patient$value*18.01559 +length(unique(patient$uploadId)) +length(unique(patient$id))==nrow(patient) +length(unique(patient$utcTime)) +anyDuplicated(patient$utcTime) +length(unique(patient$utcTime))==nrow(patient) df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2") - - df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5") - - - ``` - ```{r} -values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) -strngindex<-duplicateindex(values) -validate<-extractindex(df1,df2,strngindex) -seq1<-extractvalue(validate[3]) -seq2<-extractvalue(validate[4]) - -df1_mgvalue<-patient[seq1,"mgvalue"] -df2_mgvalue<-patient[seq2,"mgvalue"] +values <- StringConvert(df1$mgvalue,df2$mgvalue,3) +strngindex <- IndexRange(values) +validate <- ExtractDuplicateIndex(df1,df2,strngindex) +seq1 <- ExtractVectorIndex(validate[3]) +seq2 <- ExtractVectorIndex(validate[4]) + +df1_mgvalue<-patient[seq1,"value"] +df2_mgvalue<-patient[seq2,"value"] compare<-data.frame(df1_mgvalue,df2_mgvalue) - - plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), plot.type = "multiple", col = c("red", "blue")) - sum(compare$df1_mgvalue-compare$df2_mgvalue) - ``` - - - - - - - - - - - - - - - -#cross correlation From 6ac33e872e44f9c4cf802d06d9c7e7541ed9acbb Mon Sep 17 00:00:00 2001 From: Amenze Okpah Date: Wed, 15 Aug 2018 14:06:52 -0700 Subject: [PATCH 9/9] Data Deduplication functions --- deduplicationFunction.R | 72 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 deduplicationFunction.R diff --git a/deduplicationFunction.R b/deduplicationFunction.R new file mode 100644 index 00000000..1f5a0e48 --- /dev/null +++ b/deduplicationFunction.R @@ -0,0 +1,72 @@ +StringConvert <- function(x, y, alpha.Size){ + # + # Coverts two vectors string equivalent + # Args: + # x:vector value for uploadid x + # y:vector value for uploadid y + # alpha.Size: Alphabet size. + # Returns: + # longeststring:longest common substring match between discretized value of x and y respectively stringX,stringY + # percentage similarity between vector x and y + if (length(x) != length(y)){ + normvalue = x + } + else if(length(x) == length(y)){ + normvalue = x + } + normvalue.Mean <- mean(normvalue) + normvalue.Dev <- sd(normvalue) + xnormalized <- (x - normvalue.Mean) / normvalue.Dev + ynormalized <- (y - normvalue.Mean) / normvalue.Dev + X.PAA = paa(xnormalized, length(x)) + y.PAA = paa(ynormalized, length(y)) + xString.Value <- series_to_string(X.PAA, alpha.Size) + yString.Value <- series_to_string(y.PAA, alpha.Size) + stringX <- xString.Value + stringY <- yString.Value + longeststring<-LCSn(c(stringX,stringY)) + return (list(longeststring, stringX, stringY, (levenshteinSim(xString.Value, yString.Value)))) +} + + +IndexRange<- function(values){ + #computes the range of matching character between string values and common subsequence + # Args: + # values:output from stringcnvert function + # Returns: + # Loc:Matching Ranges + strings.Values <- as.character(c(values[2],values[3])) + common.SubSequence <- as.character(values[1]) + loc <- str_locate(strings.Values, common.SubSequence) + return(list(loc)) +} + + +ExtractDuplicateIndex <- function(x, y, IndexRange){ + ##computes the duplicated values for each vectors and their indexes + # Args: + # x & y: vectors + # Returns: + # duplicated.X:duplicated value for vextor x + # duplicated.Y:duplicated value for vextor y + # index.X:duplicated values index for vector x + # index.Y:duplicated values index for vector y + duplicate.X <- x[IndexRange[[1]][1,][1] : IndexRange[[1]][1,][2], "value"] + duplicate.Y <- y[IndexRange[[1]][2,][1] : IndexRange[[1]][2,][2], "value"] + index.X <- row.names(x[IndexRange[[1]][1,][1] : IndexRange[[1]][1,][2],]) + index.Y <- row.names(y[IndexRange[[1]][2,][1] : IndexRange[[1]][2,][2],]) + return(list(duplicate.X, duplicate.Y, index.X, index.Y)) +} + + +ExtractVectorIndex<- function(x){ + ##computes a list of indexes + # Args: + # x: vectors + # Returns: + # duplicate.Index:duplicated value index + for (i in 1:length(x)) + indexnum <- c(x[[i]]) + duplicate.Index<-as.numeric(indexnum) + return(duplicate.Index) +} \ No newline at end of file