From 659a938e16c0700dcfb64b92d841316f77eade96 Mon Sep 17 00:00:00 2001
From: Amenze Okpah <eaokpah@dons.usfca.edu>
Date: Sat, 14 Jul 2018 21:22:07 -0700
Subject: [PATCH 1/9] deduplication function first Scenario

---
 deduplication.Rmd | 296 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 296 insertions(+)
 create mode 100644 deduplication.Rmd

diff --git a/deduplication.Rmd b/deduplication.Rmd
new file mode 100644
index 00000000..c25706e1
--- /dev/null
+++ b/deduplication.Rmd
@@ -0,0 +1,296 @@
+---
+title: "uploadid"
+author: "Amenze"
+date: "June 28, 2018"
+output: html_document
+---
+
+```{r}
+setwd("/Users/Amenze/Desktop/tidepool/refdata")
+```
+
+
+```{r }
+#library(data.table)
+library(ggplot2)
+library(plyr)
+library(dplyr)
+#using jmotif
+library(jmotif)
+#install.packages("RecordLinkage")
+library(RecordLinkage)
+#install.packages("PTXQC")
+library("PTXQC")
+library(stringr)
+library(zoo)
+
+
+
+```
+
+
+
+
+```{r}
+
+# List every CSV export in the reference-data directory (file names only).
+list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern="\\.csv$")
+
+# For each export keep the "cbg" rows and, when any utcTime occurs more than
+# once, save those rows in the same directory as "duplicated<original name>".
+# seq_along() keeps the loop from visiting indices 1 and 0 when no file matches.
+for (i in seq_along(list.filenames))
+{
+  patient<-read.csv(file.path("/Users/Amenze/Desktop/tidepool/refdata",list.filenames[i]))
+  patient_cbg<-subset(patient,patient$type=="cbg")
+  if (anyDuplicated(patient_cbg$utcTime)>0)
+      write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i]))
+ }
+```
+
+
+
+
+
+```{r}
+##read in file and check for duplicated utc
+
+field<-c("deviceId","id","uploadId","utcTime","type","value")
+patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field]
+patient$mgvalue<-patient$value*18.01559
+length(unique(patient$uploadId)) #1319
+length(unique(patient$id))==nrow(patient) ##True
+length(unique(patient$utcTime))#72287
+length(unique(patient$utcTime))==nrow(patient) #False
+
+
+
+###subset patient file based on  5 uploadids
+df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f")
+
+df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb")
+
+
+df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
+
+
+df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834")
+
+
+
+df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5")
+
+#plot values for each uploadid
+par(mfrow=c(3,3))
+
+
+plot.ts(df1$value)
+plot.ts(df2$value)
+plot.ts(df3$value)
+plot.ts(df4$value)
+plot.ts(df5$value)
+
+```
+
+
+
+```{r}
+## stringcnvert: SAX-encode two value series and compare the resulting strings.
+## input
+#x: numeric vector of values for upload x; its mean and sd scale both series
+#y: numeric vector of values for upload y
+#alphaxy: the alphabet size handed to series_to_string()
+## outputs (list, in this order)
+#longeststring, p1, p2, similarity: LCSn of both strings, string for x, string for y, levenshteinSim of both
+
+stringcnvert<-function(x,y,alphaxy){
+  # Both series are scaled with the mean and sd of x, so equal raw values in
+  # x and y end up with equal z-scores.
+  normvalue.mean<-mean(x)
+  normvalue.dev<-sd(x)
+  xznorm<-(x - normvalue.mean)/normvalue.dev
+  yznorm<-(y - normvalue.mean)/normvalue.dev
+  # PAA size is the full series length, i.e. no shortening is requested.
+  x_paa<-paa(xznorm,length(x))
+  y_paa<-paa(yznorm,length(y))
+  p1<-series_to_string(x_paa, alphaxy)
+  p2<-series_to_string(y_paa, alphaxy)
+  longeststring<-LCSn(c(p1,p2))
+  # Element order is part of the contract (callers index by position);
+  # the names only add a readable alternative.
+  return(list(longeststring=longeststring,
+              p1=p1,
+              p2=p2,
+              similarity=levenshteinSim(p1,p2)))
+
+
+}
+
+
+
+
+## duplicateindex: look for the common substring inside both SAX strings.
+## input
+#values: list returned by stringcnvert (1: common substring, 2 and 3: the two strings)
+## outputs (list, in this order)
+#stringsvalues: the two strings that were compared
+#substringrep: the common substring that was searched for
+#stringdetect: logical per string, TRUE when the substring was found
+#stringcount / loc: number of matches per string / start and end of the first match per string
+
+duplicateindex<-function(values){
+  needle<-as.character(values[1])
+  haystacks<-as.character(values[2:3])
+  found<-str_detect(haystacks,needle)
+  hits<-str_count(haystacks,needle)
+  span<-str_locate(haystacks,needle)
+  return(list(haystacks,needle,found,hits,span))
+}
+
+
+
+## extractindex: pull the matched stretch out of both data frames.
+## input
+#x & y: data frames with a "value" column
+#strngindex: list returned by duplicateindex; element 5 is the start/end matrix from str_locate
+## outputs (list, in this order)
+#s1 / s2: "value" entries of x / y inside the match
+#indexs1 / indexs2: row names of x / y inside the match
+#
+extractindex<-function(x,y,strngindex){
+  loc<-strngindex[[5]]
+  # column-major positions: 1 = start in x, 2 = start in y, 3 = end in x, 4 = end in y
+  rowsx<-loc[[1]]:loc[[3]]
+  rowsy<-loc[[2]]:loc[[4]]
+  s1<-x[rowsx,"value"]
+  s2<-y[rowsy,"value"]
+  indexs1<-row.names(x[rowsx,])
+  indexs2<-row.names(y[rowsy,])
+  return(list(s1,s2,indexs1,indexs2))
+}
+
+
+## extractvalue: turn row-name indexes into numbers.
+###input
+#x: a list of index vectors (character row names), e.g. validate[3]
+##output
+#numeric vector of all indexes in x, in list order; numeric(0) for an empty list
+extractvalue<-function(x){
+  # unlist() flattens every element of the list; a brace-less for loop here
+  # would keep only the last element and fail on an empty list (1:0).
+  indexnum1<-as.numeric(unlist(x,use.names=FALSE))
+  return(indexnum1)
+}
+
+##main
+#
+values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
+strngindex<-duplicateindex(values)
+validate<-extractindex(df1,df2,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+# par(mfrow=c(1,2))
+# plot.ts(compare$df1_mgvalue)
+# plot.ts(compare$df2_mgvalue)
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+
+
+```
+
+
+
+```{r}
+values<-stringcnvert(df3$mgvalue,df4$mgvalue,3) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run
+strngindex<-duplicateindex(values)
+validate<-extractindex(df3,df4,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+# par(mfrow=c(1,2))
+# plot.ts(compare$df1_mgvalue)
+# plot.ts(compare$df2_mgvalue)
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+```
+
+
+
+
+```{r}
+patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field]
+patient$mgvalue<-patient$value*18.01559
+length(unique(patient$uploadId)) #2
+length(unique(patient$id))==nrow(patient) ##True
+length(unique(patient$utcTime))# 8899
+anyDuplicated(patient$utcTime) #4366
+length(unique(patient$utcTime))==nrow(patient) #False
+
+
+
+df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2")
+
+
+df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5")
+  
+
+
+```
+
+
+```{r}
+values<-stringcnvert(df1$mgvalue,df2$mgvalue,8)
+strngindex<-duplicateindex(values)
+validate<-extractindex(df1,df2,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+
+
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#cross correlation

From bb99788b0a10bc667df614ab99e5a5047788f75a Mon Sep 17 00:00:00 2001
From: Amenze Okpah <eaokpah@dons.usfca.edu>
Date: Tue, 17 Jul 2018 10:46:00 -0700
Subject: [PATCH 2/9] Cross correlation

---
 crosscorrelation.Rmd | 384 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 384 insertions(+)
 create mode 100644 crosscorrelation.Rmd

diff --git a/crosscorrelation.Rmd b/crosscorrelation.Rmd
new file mode 100644
index 00000000..c5d296a7
--- /dev/null
+++ b/crosscorrelation.Rmd
@@ -0,0 +1,384 @@
+---
+title: "uploadid"
+author: "Amenze"
+date: "June 28, 2018"
+output: html_document
+---
+
+```{r}
+setwd("/Users/Amenze/Desktop/tidepool/refdata")
+```
+
+
+```{r }
+#library(data.table)
+library(ggplot2)
+library(plyr)
+library(dplyr)
+#using jmotif
+library(jmotif)
+#install.packages("RecordLinkage")
+library(RecordLinkage)
+#install.packages("PTXQC")
+library("PTXQC")
+library(stringr)
+library(zoo)
+
+
+
+```
+
+
+
+
+```{r}
+
+# List every CSV export in the reference-data directory (file names only).
+list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern="\\.csv$")
+
+# For each export keep the "cbg" rows and, when any utcTime occurs more than
+# once, save those rows in the same directory as "duplicated<original name>".
+# seq_along() keeps the loop from visiting indices 1 and 0 when no file matches.
+for (i in seq_along(list.filenames))
+{
+  patient<-read.csv(file.path("/Users/Amenze/Desktop/tidepool/refdata",list.filenames[i]))
+  patient_cbg<-subset(patient,patient$type=="cbg")
+  if (anyDuplicated(patient_cbg$utcTime)>0)
+      write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i]))
+ }
+```
+
+
+
+
+
+```{r}
+##read in file and check for duplicated utc
+
+field<-c("deviceId","id","uploadId","utcTime","type","value")
+patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field]
+patient$mgvalue<-patient$value*18.01559
+length(unique(patient$uploadId)) #1319
+length(unique(patient$id))==nrow(patient) ##True
+length(unique(patient$utcTime))#72287
+length(unique(patient$utcTime))==nrow(patient) #False
+
+
+
+###subset patient file based on  5 uploadids
+df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f")
+
+df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb")
+
+
+df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
+
+
+df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834")
+
+
+
+df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5")
+
+#plot values for each uploadid
+par(mfrow=c(3,3))
+
+
+plot.ts(df1$value)
+plot.ts(df2$value)
+plot.ts(df3$value)
+plot.ts(df4$value)
+plot.ts(df5$value)
+
+```
+
+
+
+```{r}
+## stringcnvert: SAX-encode two value series and compare the resulting strings.
+## input
+#x: numeric vector of values for upload x; its mean and sd scale both series
+#y: numeric vector of values for upload y
+#alphaxy: the alphabet size handed to series_to_string()
+## outputs (list, in this order)
+#longeststring, p1, p2, similarity: LCSn of both strings, string for x, string for y, levenshteinSim of both
+
+stringcnvert<-function(x,y,alphaxy){
+  # Both series are scaled with the mean and sd of x, so equal raw values in
+  # x and y end up with equal z-scores.
+  normvalue.mean<-mean(x)
+  normvalue.dev<-sd(x)
+  xznorm<-(x - normvalue.mean)/normvalue.dev
+  yznorm<-(y - normvalue.mean)/normvalue.dev
+  # PAA size is the full series length, i.e. no shortening is requested.
+  x_paa<-paa(xznorm,length(x))
+  y_paa<-paa(yznorm,length(y))
+  p1<-series_to_string(x_paa, alphaxy)
+  p2<-series_to_string(y_paa, alphaxy)
+  longeststring<-LCSn(c(p1,p2))
+  # Element order is part of the contract (callers index by position);
+  # the names only add a readable alternative.
+  return(list(longeststring=longeststring,
+              p1=p1,
+              p2=p2,
+              similarity=levenshteinSim(p1,p2)))
+
+
+}
+
+
+
+
+## duplicateindex: look for the common substring inside both SAX strings.
+## input
+#values: list returned by stringcnvert (1: common substring, 2 and 3: the two strings)
+## outputs (list, in this order)
+#stringsvalues: the two strings that were compared
+#substringrep: the common substring that was searched for
+#stringdetect: logical per string, TRUE when the substring was found
+#stringcount / loc: number of matches per string / start and end of the first match per string
+
+duplicateindex<-function(values){
+  needle<-as.character(values[1])
+  haystacks<-as.character(values[2:3])
+  found<-str_detect(haystacks,needle)
+  hits<-str_count(haystacks,needle)
+  span<-str_locate(haystacks,needle)
+  return(list(haystacks,needle,found,hits,span))
+}
+
+
+
+## extractindex: pull the matched stretch out of both data frames.
+## input
+#x & y: data frames with a "value" column
+#strngindex: list returned by duplicateindex; element 5 is the start/end matrix from str_locate
+## outputs (list, in this order)
+#s1 / s2: "value" entries of x / y inside the match
+#indexs1 / indexs2: row names of x / y inside the match
+#
+extractindex<-function(x,y,strngindex){
+  loc<-strngindex[[5]]
+  # column-major positions: 1 = start in x, 2 = start in y, 3 = end in x, 4 = end in y
+  rowsx<-loc[[1]]:loc[[3]]
+  rowsy<-loc[[2]]:loc[[4]]
+  s1<-x[rowsx,"value"]
+  s2<-y[rowsy,"value"]
+  indexs1<-row.names(x[rowsx,])
+  indexs2<-row.names(y[rowsy,])
+  return(list(s1,s2,indexs1,indexs2))
+}
+
+
+## extractvalue: turn row-name indexes into numbers.
+###input
+#x: a list of index vectors (character row names), e.g. validate[3]
+##output
+#numeric vector of all indexes in x, in list order; numeric(0) for an empty list
+extractvalue<-function(x){
+  # unlist() flattens every element of the list; a brace-less for loop here
+  # would keep only the last element and fail on an empty list (1:0).
+  indexnum1<-as.numeric(unlist(x,use.names=FALSE))
+  return(indexnum1)
+}
+
+##main
+#
+system.time(values<-stringcnvert(df1$mgvalue,df2$mgvalue,3))#http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
+#user  system elapsed 
+#   0.11    0.00    0.11 
+
+system.time(strngindex<-duplicateindex(values))
+validate<-extractindex(df1,df2,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+# par(mfrow=c(1,2))
+# plot.ts(compare$df1_mgvalue)
+# plot.ts(compare$df2_mgvalue)
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+
+
+```
+
+
+
+```{r}
+values<-stringcnvert(df3$mgvalue,df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run
+strngindex<-duplicateindex(values)
+validate<-extractindex(df3,df4,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+# par(mfrow=c(1,2))
+# plot.ts(compare$df1_mgvalue)
+# plot.ts(compare$df2_mgvalue)
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+```
+
+
+
+
+```{r}
+patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field]
+patient$mgvalue<-patient$value*18.01559
+length(unique(patient$uploadId)) #2
+length(unique(patient$id))==nrow(patient) ##True
+length(unique(patient$utcTime))# 8899
+anyDuplicated(patient$utcTime) #4366
+length(unique(patient$utcTime))==nrow(patient) #False
+
+
+
+df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2")
+
+
+df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5")
+  
+
+
+```
+
+
+```{r}
+values<-stringcnvert(df1$mgvalue,df2$mgvalue,8)
+strngindex<-duplicateindex(values)
+validate<-extractindex(df1,df2,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+
+
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#cross correlation
+```{r}
+# corre: line two uploads up at the lag with the highest cross-correlation.
+#
+# input
+#   x, y:    data frames with a numeric "mgvalue" column
+#   lagsize: largest lag (in rows) that ccf() evaluates in each direction
+# output: list(df1, df2, max_correlation)
+#   df1, df2: overlapping rows of x and y after shifting by the best lag,
+#             in their original order and always with equal row counts
+#   max_correlation: the highest cross-correlation coefficient found
+# side effect: ccf() draws the correlogram (plot=TRUE)
+# Callers read the row names of df1/df2 to map back to the source rows.
+corre<-function(x,y,lagsize){
+
+  # ccf() calls this argument lag.max. Any other spelling is swallowed by
+  # "..." and handed to the plot, so the requested window would be ignored.
+  corr<-ccf(x$mgvalue,y$mgvalue,lag.max=lagsize,plot=TRUE)
+
+  # which.max() yields a single position even when the maximum is tied,
+  # and nothing at all when every coefficient is NA/NaN.
+  best<-which.max(corr$acf)
+  if (length(best)==0){
+    stop("cross-correlation is undefined for these series",call.=FALSE)
+  }
+  max.value<-corr$acf[best]
+  max.lag<-corr$lag[best]
+
+  # Row counts; length() of a data frame would count columns instead.
+  xlength<-nrow(x)
+  ylength<-nrow(y)
+
+  # The lag k reported by ccf(x, y) pairs x[t+k] with y[t]:
+  #   k > 0: x is read from row 1+k, y from row 1
+  #   k < 0: x is read from row 1,   y from row 1+|k|
+  #   k = 0: both are read from row 1
+  xstart<-1+max(max.lag,0)
+  ystart<-1+max(-max.lag,0)
+
+  # Rows left in both series once the shift is applied.
+  overlap<-min(xlength-xstart,ylength-ystart)+1
+
+  # drop=FALSE keeps data frames (and their row names) in every case.
+  xrows<-seq(xstart,length.out=overlap)
+  yrows<-seq(ystart,length.out=overlap)
+  xvalue<-x[xrows,,drop=FALSE]
+  yvalue<-y[yrows,,drop=FALSE]
+
+  return (list(df1=xvalue,df2=yvalue,max_correlation=max.value))
+
+}
+ 
+system.time(corr<-corre(df1,df2,10))
+df1adj<-corr$df1
+df2adj<-corr$df2
+ 
+#ccf(df1$mgvalue,df2$mgvalue)
+
+
+##extracte the index
+
+t1indexvalue<-row.names(df1adj)
+t2indexvalue<-row.names(df2adj)
+ts1<-patient[t1indexvalue,"mgvalue"]
+ts2<-patient[t2indexvalue,"mgvalue"]
+
+plot.zoo(cbind(ts1,ts2), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+sum(ts1-ts2)
+
+
+```
+
+
+```{r}
+corr<-corre(df3,df4,10)
+df1adj<-corr$df1
+df2adj<-corr$df2
+ 
+
+
+```
+
+
+

From 360d1dcb68e2d5cfe2bb56b0e4fbcbb71ed9b4a4 Mon Sep 17 00:00:00 2001
From: Amenze Okpah <eaokpah@dons.usfca.edu>
Date: Fri, 10 Aug 2018 01:15:31 -0700
Subject: [PATCH 3/9] use distance matrix to find longest consecutive duplicate

---
 deduplication_distance.py | 339 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 339 insertions(+)
 create mode 100644 deduplication_distance.py

diff --git a/deduplication_distance.py b/deduplication_distance.py
new file mode 100644
index 00000000..b4369392
--- /dev/null
+++ b/deduplication_distance.py
@@ -0,0 +1,339 @@
+import numpy as np
+import pandas as pd
+
+"""
+Data Deduplication of Continous blood glucose. Final output are indexes of duplications, duplicated values and a plot of the values
+"""
+#Section one
+"""
+Test data comparing two uploadid's
+files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
+      0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
+"""
+##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',')
+##df1=data.loc[data['alp'] =='x', ['value']]
+##df2=data.loc[data['alp'] =='y', ['value']]
+
+
+data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',')
+##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']]
+
+
+##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
+
+
+df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
+df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']]
+
+
+
+##data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',')
+##df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']]
+
+
+
+
+def distances(x,y):
+   """
+   input:
+   x & y: vectors of cbg values
+
+   output
+   distance matrix of x and y
+   """
+   if len(y)>len(x):
+      leny=len(y)
+      lenx=len(x)
+      xval=x
+      yval=y
+   elif len(x)>len(y):
+      leny=len(x)
+      lenx=len(y)
+      xval=y
+      yval=x
+   elif len(y)==len(x):
+      lenx=len(x)
+      leny=len(y)
+      xval=x
+      yval=y
+   distances= [[0] * lenx for i in range(leny)]
+   for i in range(leny):  
+     for j in range(lenx):   
+         distances[i][j] = ((xval[j])-(yval[i]))**2
+   return distances
+
+
+def lstdiagonal(dis):
+   """
+   input
+    dis: distance matrix
+   output:generate a list of diagonals
+     a:the diagonal array with max sum of zeroes.
+     w: start index
+   """
+   matrix=np.array(dis)
+   j=-len(dis)
+   x=len(dis[0])+1
+   longest_match=0
+   for i in range(len(dis[0])-1,j,-1): 
+       arr=matrix.diagonal(i)
+       nbr_zero=(arr == 0).sum()
+       if nbr_zero >= longest_match:
+          longest_match = nbr_zero
+          a=arr
+          w=abs(i)
+   return (a,w)
+
+
+
+
+
+def diagonalzero(df,ts1,ts2,startindex):
+   '''
+   df: distance matrix
+   ts1:vector 1
+   ts2:vector 2
+   startindex: the start index for the diagonal
+   ouput: returns all values  and index in diagonals with max zeroes
+   '''
+   diaindex=[]
+   diaval=[]
+   if len(ts2)>len(ts1):
+      leny=len(ts2)
+      j=len(ts1)-1
+   elif len(ts1)>len(ts2):
+      leny=len(ts1)
+      j=len(ts2)-1
+   elif len(ts1)==len(ts2):
+      lenx=len(ts1)
+      leny=len(ts2)
+      j=len(ts1)-1
+   i=startindex
+   k=0
+   while i<leny and k<=j:
+       if df[i][k]>=0:
+          diaindex+=[[i,k]]
+          diaval+=[df[i][k]]  
+          k=k+1
+          i=i+1  
+   return (diaindex,diaval)
+
+
+
+def zero_runs(a): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array
+   """
+   a: array output from diagonal zero
+   output:
+    Return the consecutive zero in the array
+   """
+   iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
+   absdiff = np.abs(np.diff(iszero))
+   ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
+   return ranges
+
+
+def countzero(runs):
+   """
+    runs: list of start and stop index of the consecutive zeros in an array
+    output:returns the start and stop index with max zeroes
+    ind: the index of the result within runs list
+     
+   """
+   maxcount=0
+   for i in  range(len(runs)):
+        x=runs[i][1]-runs[i][0]
+        if x>=maxcount:
+            maxcount=x
+            ind=i
+   count=maxcount
+   return count,ind
+
+
+def zeroindex(runs,dia,runindex):
+    '''
+    dia:diagonal indexes returned from function diagonalzeros
+    runs:start and stop indexes
+    output"accumulate the indexes of consecutive zero
+    the index for the longest zeros"
+    '''
+    i=runs[0]
+    j=runs[1]
+    indexlst=[]
+    for i in range(i,j):
+       indexlst+=[dia[i]]
+
+    return indexlst
+
+
+
+def dupindex(x,y,diagonalzero):
+    """
+    x & y : vectors
+    diagonalzero:Output from function zeroindex
+     output
+     line upindexes for vector x and y and the duplicate values
+    """
+    yvalue=[]
+    xvalue=[]
+    xdup=[]
+    ydup=[]
+    if len(y)>len(x):
+       yval=y
+       xval=x
+    elif len(x)>len(y):
+      xval=y
+      yval=x
+    elif len(y)==len(x):
+      xval=x
+      yval=y
+    for i in range(len(diagonalzero)):
+       yvalue+=[diagonalzero[i][0]]
+    yindex=yvalue
+    for i in range(len(diagonalzero)):
+       xvalue+=[diagonalzero[i][1]]
+    xindex=xvalue
+    for i in range(len(xindex)):
+        val=xindex[i]
+        xdup+=[xval[val]] 
+    ts1dup=xdup
+    ts1dup.reverse()
+    for i in range(len(yindex)):
+        val=yindex[i]
+        ydup+=[yval[val]]
+    ts2dup=ydup
+    ts2dup.reverse()
+    return(xindex,yindex,ts1dup,ts2dup)
+   
+def lookupdict(x):
+   """
+   x: vactor
+   output: a dictionary holding the original indexes of vector
+   """
+   indexdict={}
+   for i in range(len(x)):
+       indexdict[i]=x[i]
+   return indexdict
+
+
+##retrieve original index
+def original(xval,yval,xind,yind):
+   """
+   input:
+   xval & yval: dictionary from function lookupdict
+   xind $ yind: line up indexes
+   output: return a list of original duplicate indexes
+   """
+   xlist=[]
+   ylist=[]
+   if len(yind) > len(xind):
+      y=yind
+      x=xind
+   elif len(xind)>len(xind):
+      y=xind
+      x=yind
+      xdict=x
+   elif len(xind)==len(yind):
+      y=yind
+      x=xind
+   if len(yval)>len(xval):
+      xdict=xval
+      ydict=yval
+   elif len(xval)> len(yval):
+       xdict=yval
+       ydict=xval
+   elif len(xval)==len(yval):
+      xdict=xval
+      ydict=yval
+   for i in xind:
+        xlist+=[xdict[i]]
+   for i in yind:
+        ylist+=[ydict[i]]
+   xlist.reverse()
+   ylist.reverse()
+   return (xlist,ylist)
+     
+
+
+###main###
+print "--------------vector one and two--------------------------"
+ts1=np.array(df1['value'])
+ts2=np.array(df2['value'])
+
+##print "--------------original index for vector one and two--------------------------------------"
+ts1index= df1.index
+print "-----------------"
+print ts1index
+ 
+ts2index=df2.index
+print ts2index
+####
+print "--------------------distance matrix---------------------"
+dis=distances(ts1,ts2)
+###print dis
+##
+print "----------------list of diagonals ---------------------------------------------"
+arr,index=lstdiagonal(dis)
+startindex=index
+zeroarr=arr
+#print startindex
+#print zeroarr
+      
+print "---------------diagonal index with zero---------------------------------------"
+diaindex,diaval= diagonalzero(dis,ts1,ts2,startindex)
+diavals=diaval
+diaindexes=diaindex
+#print diavals
+#print diaindexes
+
+runs=zero_runs(diavals)
+
+print "---list of indexes---"
+#print runs
+print "----maximum count of zero-------------------------------------"
+sumzero,i=countzero(runs)
+sumindex=i
+maxruns=runs[sumindex]
+#print maxruns
+#print "***********"
+#print sumindex
+
+zeroruns=sumzero
+#print zeroruns
+######
+diazero=zeroindex(maxruns,diaindexes,sumindex)
+####
+#print diazero
+xindex2,yindex2,ts1value,ts2value=dupindex(ts1,ts2,diazero)
+######
+xindex=xindex2
+yindex=yindex2
+duplicatedts1=ts1value
+duplicatedts2=ts2value
+##print xindex
+##print yindex
+print duplicatedts1
+##print"-----------------------------------------------------------------------------"
+print duplicatedts2
+####
+import matplotlib.pyplot as plt
+plt.subplot(2, 1, 1)
+plt.plot(duplicatedts1,'r-')
+plt.ylabel('upid_3c41703c2d3a8b97f479afdb6ccf799f cbg')
+plt.subplot(2, 1, 2)
+plt.plot(duplicatedts2)
+plt.ylabel('upid_3fc32e5ad912a8ea7efced9151804bdb cbg')
+
+plt.show()
+print "-----------dictionary for original index--------------------------------------------------"  
+originalts1=lookupdict(ts1index)
+originalts2=lookupdict(ts2index)
+print originalts1
+print originalts2
+
+
+print "--------------match to original index-------------------------"
+print original(originalts1,originalts2,xindex,yindex) 
+

From 611c7e6f480057d83d810ae941a5eba619bd42a5 Mon Sep 17 00:00:00 2001
From: Amenzeo <34592606+Amenzeo@users.noreply.github.com>
Date: Sun, 12 Aug 2018 02:56:23 -0700
Subject: [PATCH 4/9] Delete deduplication_distance.py

---
 deduplication_distance.py | 339 --------------------------------------
 1 file changed, 339 deletions(-)
 delete mode 100644 deduplication_distance.py

diff --git a/deduplication_distance.py b/deduplication_distance.py
deleted file mode 100644
index b4369392..00000000
--- a/deduplication_distance.py
+++ /dev/null
@@ -1,339 +0,0 @@
-import numpy as np
-import pandas as pd
-
-"""
-Data Deduplication of Continous blood glucose. Final output are indexes of duplications, duplicated values and a plot of the values
-"""
-#Section one
-"""
-Test data comparing two uploadid's
-files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
-      0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
-"""
-##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',')
-##df1=data.loc[data['alp'] =='x', ['value']]
-##df2=data.loc[data['alp'] =='y', ['value']]
-
-
-data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',')
-##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']]
-##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']]
-
-
-##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']]
-##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
-
-
-df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
-df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']]
-
-
-
-##data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',')
-##df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']]
-##df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']]
-
-
-
-
-def distances(x,y):
-   """
-   input:
-   x & y: vectors of cbg values
-
-   output
-   distance matrix of x and y
-   """
-   if len(y)>len(x):
-      leny=len(y)
-      lenx=len(x)
-      xval=x
-      yval=y
-   elif len(x)>len(y):
-      leny=len(x)
-      lenx=len(y)
-      xval=y
-      yval=x
-   elif len(y)==len(x):
-      lenx=len(x)
-      leny=len(y)
-      xval=x
-      yval=y
-   distances= [[0] * lenx for i in range(leny)]
-   for i in range(leny):  
-     for j in range(lenx):   
-         distances[i][j] = ((xval[j])-(yval[i]))**2
-   return distances
-
-
-def lstdiagonal(dis):
-   """
-   input
-    dis: distance matrix
-   output:generate a list of diagonals
-     a:the diagonal array with max sum of zeroes.
-     w: start index
-   """
-   matrix=np.array(dis)
-   j=-len(dis)
-   x=len(dis[0])+1
-   longest_match=0
-   for i in range(len(dis[0])-1,j,-1): 
-       arr=matrix.diagonal(i)
-       nbr_zero=(arr == 0).sum()
-       if nbr_zero >= longest_match:
-          longest_match = nbr_zero
-          a=arr
-          w=abs(i)
-   return (a,w)
-
-
-
-
-
-def diagonalzero(df,ts1,ts2,startindex):
-   '''
-   df: distance matrix
-   ts1:vector 1
-   ts2:vector 2
-   startindex: the start index for the diagonal
-   ouput: returns all values  and index in diagonals with max zeroes
-   '''
-   diaindex=[]
-   diaval=[]
-   if len(ts2)>len(ts1):
-      leny=len(ts2)
-      j=len(ts1)-1
-   elif len(ts1)>len(ts2):
-      leny=len(ts1)
-      j=len(ts2)-1
-   elif len(ts1)==len(ts2):
-      lenx=len(ts1)
-      leny=len(ts2)
-      j=len(ts1)-1
-   i=startindex
-   k=0
-   while i<leny and k<=j:
-       if df[i][k]>=0:
-          diaindex+=[[i,k]]
-          diaval+=[df[i][k]]  
-          k=k+1
-          i=i+1  
-   return (diaindex,diaval)
-
-
-
-def zero_runs(a): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array
-   """
-   a: array output from diagonal zero
-   output:
-    Return the consecutive zero in the array
-   """
-   iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
-   absdiff = np.abs(np.diff(iszero))
-   ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
-   return ranges
-
-
-def countzero(runs):
-   """
-    runs: list of start and stop index of the consecutive zeros in an array
-    output:returns the start and stop index with max zeroes
-    ind: the index of the result within runs list
-     
-   """
-   maxcount=0
-   for i in  range(len(runs)):
-        x=runs[i][1]-runs[i][0]
-        if x>=maxcount:
-            maxcount=x
-            ind=i
-   count=maxcount
-   return count,ind
-
-
-def zeroindex(runs,dia,runindex):
-    '''
-    dia:diagonal indexes returned from function diagonalzeros
-    runs:start and stop indexes
-    output"accumulate the indexes of consecutive zero
-    the index for the longest zeros"
-    '''
-    i=runs[0]
-    j=runs[1]
-    indexlst=[]
-    for i in range(i,j):
-       indexlst+=[dia[i]]
-
-    return indexlst
-
-
-
-def dupindex(x,y,diagonalzero):
-    """
-    x & y : vectors
-    diagonalzero:Output from function zeroindex
-     output
-     line upindexes for vector x and y and the duplicate values
-    """
-    yvalue=[]
-    xvalue=[]
-    xdup=[]
-    ydup=[]
-    if len(y)>len(x):
-       yval=y
-       xval=x
-    elif len(x)>len(y):
-      xval=y
-      yval=x
-    elif len(y)==len(x):
-      xval=x
-      yval=y
-    for i in range(len(diagonalzero)):
-       yvalue+=[diagonalzero[i][0]]
-    yindex=yvalue
-    for i in range(len(diagonalzero)):
-       xvalue+=[diagonalzero[i][1]]
-    xindex=xvalue
-    for i in range(len(xindex)):
-        val=xindex[i]
-        xdup+=[xval[val]] 
-    ts1dup=xdup
-    ts1dup.reverse()
-    for i in range(len(yindex)):
-        val=yindex[i]
-        ydup+=[yval[val]]
-    ts2dup=ydup
-    ts2dup.reverse()
-    return(xindex,yindex,ts1dup,ts2dup)
-   
-def lookupdict(x):
-   """
-   x: vactor
-   output: a dictionary holding the original indexes of vector
-   """
-   indexdict={}
-   for i in range(len(x)):
-       indexdict[i]=x[i]
-   return indexdict
-
-
-##retrieve original index
-def original(xval,yval,xind,yind):
-   """
-   input:
-   xval & yval: dictionary from function lookupdict
-   xind $ yind: line up indexes
-   output: return a list of original duplicate indexes
-   """
-   xlist=[]
-   ylist=[]
-   if len(yind) > len(xind):
-      y=yind
-      x=xind
-   elif len(xind)>len(xind):
-      y=xind
-      x=yind
-      xdict=x
-   elif len(xind)==len(yind):
-      y=yind
-      x=xind
-   if len(yval)>len(xval):
-      xdict=xval
-      ydict=yval
-   elif len(xval)> len(yval):
-       xdict=yval
-       ydict=xval
-   elif len(xval)==len(yval):
-      xdict=xval
-      ydict=yval
-   for i in xind:
-        xlist+=[xdict[i]]
-   for i in yind:
-        ylist+=[ydict[i]]
-   xlist.reverse()
-   ylist.reverse()
-   return (xlist,ylist)
-     
-
-
-###main###
-print "--------------vector one and two--------------------------"
-ts1=np.array(df1['value'])
-ts2=np.array(df2['value'])
-
-##print "--------------original index for vector one and two--------------------------------------"
-ts1index= df1.index
-print "-----------------"
-print ts1index
- 
-ts2index=df2.index
-print ts2index
-####
-print "--------------------distance matrix---------------------"
-dis=distances(ts1,ts2)
-###print dis
-##
-print "----------------list of diagonals ---------------------------------------------"
-arr,index=lstdiagonal(dis)
-startindex=index
-zeroarr=arr
-#print startindex
-#print zeroarr
-      
-print "---------------diagonal index with zero---------------------------------------"
-diaindex,diaval= diagonalzero(dis,ts1,ts2,startindex)
-diavals=diaval
-diaindexes=diaindex
-#print diavals
-#print diaindexes
-
-runs=zero_runs(diavals)
-
-print "---list of indexes---"
-#print runs
-print "----maximum count of zero-------------------------------------"
-sumzero,i=countzero(runs)
-sumindex=i
-maxruns=runs[sumindex]
-#print maxruns
-#print "***********"
-#print sumindex
-
-zeroruns=sumzero
-#print zeroruns
-######
-diazero=zeroindex(maxruns,diaindexes,sumindex)
-####
-#print diazero
-xindex2,yindex2,ts1value,ts2value=dupindex(ts1,ts2,diazero)
-######
-xindex=xindex2
-yindex=yindex2
-duplicatedts1=ts1value
-duplicatedts2=ts2value
-##print xindex
-##print yindex
-print duplicatedts1
-##print"-----------------------------------------------------------------------------"
-print duplicatedts2
-####
-import matplotlib.pyplot as plt
-plt.subplot(2, 1, 1)
-plt.plot(duplicatedts1,'r-')
-plt.ylabel('upid_3c41703c2d3a8b97f479afdb6ccf799f cbg')
-plt.subplot(2, 1, 2)
-plt.plot(duplicatedts2)
-plt.ylabel('upid_3fc32e5ad912a8ea7efced9151804bdb cbg')
-
-plt.show()
-print "-----------dictionary for original index--------------------------------------------------"  
-originalts1=lookupdict(ts1index)
-originalts2=lookupdict(ts2index)
-print originalts1
-print originalts2
-
-
-print "--------------match to original index-------------------------"
-print original(originalts1,originalts2,xindex,yindex) 
-

From 8fccbbbbb807a720e8f1d951fef4ae6e445325c5 Mon Sep 17 00:00:00 2001
From: Amenze Okpah <eaokpah@dons.usfca.edu>
Date: Sun, 12 Aug 2018 03:06:59 -0700
Subject: [PATCH 5/9] Create functions to find duplicated values between two
 vectors consecutive zero values through the diagonal use to obtain matching
 indexes.

---
 deduplication_distance.py | 356 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 356 insertions(+)
 create mode 100644 deduplication_distance.py

diff --git a/deduplication_distance.py b/deduplication_distance.py
new file mode 100644
index 00000000..7a36820b
--- /dev/null
+++ b/deduplication_distance.py
@@ -0,0 +1,356 @@
+import numpy as np
+import pandas as pd
+
+"""
+Data Deduplication of Continous blood glucose. Final output are indexes of duplicated values, duplicated values and a plot of the values
+"""
+
+"""
+Test data  from two files
+files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
+      0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
+"""
+
+#Test data1
+##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',')
+##print data
+##df1=data.loc[data['alp'] =='x', ['value']]
+##df2=data.loc[data['alp'] =='y', ['value']]
+
+#Test data2
+#data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',')
+##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']]
+
+
+#Test data3
+##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
+
+
+#Test data4
+##df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']]
+
+
+#Test data5
+data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',')
+df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']]
+df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']]
+
+
+
+
+def Distances(x,y):
+   """
+   Compute the distance matrix for vector x and y
+   Args:
+    x:vector value for uploadid x 
+    y:vector value for uploadid y
+   Returns:
+     distances: distance matrix of x and y
+   """
+   if len(y) > len(x):
+      leny = len(y)
+      lenx = len(x)
+      xval = x
+      yval = y
+   elif len(x) > len(y):
+      leny = len(x)
+      lenx = len(y)
+      xval = y
+      yval = x
+   elif len(y) == len(x):
+      lenx = len(x)
+      leny = len(y)
+      xval = x
+      yval = y
+   distances= [[0] * lenx for i in range(leny)]
+   for i in range(leny):  
+     for j in range(lenx):   
+         distances[i][j] = ((xval[j])-(yval[i]))**2
+   return distances
+
+
+def DiagonalList(dis):
+   """
+   Find the diagonal with the highest count of zero  and the diagonal start index
+   Args:
+    dis: distance matrix
+   Returns:
+     diagonal: diagonal with higest count of zero
+     diagonalStartIndex:Start Index of diagonal
+   """
+   matrix=np.array(dis)
+   j=-len(dis)
+   x=len(dis[0])+1
+   highestCount=0
+   for i in range(len(dis[0])-1,j,-1): 
+       arr = matrix.diagonal(i)
+       countZero = (arr == 0).sum()
+       if countZero >= highestCount:
+          highestCount = countZero
+          diagonal=arr
+          diagonalStartIndex=abs(i)
+   return (diagonal,diagonalStartIndex)
+
+
+def DiagonalZero(disMatrix,ts1,ts2,startindex):
+   """
+   Compute the diagonal Index with the highest count of zero (output from DiagonalList)
+   Args:
+    disMatrix: distance matrix
+    ts1:Vector 1
+    ts2:vector 2
+    startindex: the start index for the diagonal
+   Returns:
+     dia.Index:diagonal Index
+     dia.value: diagonal value
+   """
+   diaIndex=[]
+   diaValue=[]
+   if len(ts2) > len(ts1):
+      leny = len(ts2)
+      j = len(ts1)-1
+   elif len(ts1) > len(ts2):
+      leny = len(ts1)
+      j = len(ts2)-1
+   elif len(ts1) == len(ts2):
+      lenx = len(ts1)
+      leny = len(ts2)
+      j = len(ts1)-1
+   i = startindex
+   k = 0
+   while i < leny and k <= j:
+       if disMatrix[i][k] >= 0:
+          diaIndex += [[i,k]]
+          diaValue += [disMatrix[i][k]]  
+          k = k+1
+          i = i+1  
+   return (diaIndex,diaValue)
+
+
+
+def zero_runs(diaValue): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array
+   """
+   Args:
+   diaValue: diagonal values returned from function diagonalzero
+   Returns:
+    ranges: list of consecutive zero ranges in the diagonal
+   """
+   iszero = np.concatenate(([0], np.equal(diaValue, 0).view(np.int8), [0]))
+   absdiff = np.abs(np.diff(iszero))
+   ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
+   return ranges
+
+
+def CountZero(runs):
+   """
+   Args:
+    runs: list of start and stop index of the consecutive zeros in an array
+   Returns:
+    totalCount:returns the count of consecutive zero
+    countIndex:list index with max zeros
+     
+   """
+   maxcount=0
+   for i in  range(len(runs)):
+        count=runs[i][1]-runs[i][0]
+        if count>=maxcount:
+            maxcount,countIndex=count,i
+   if len(runs)==0: # nothing matched: fail clearly instead of hitting an unbound countIndex
+      raise ValueError("CountZero: no run of zeros found, so there is no duplicated segment")
+   return (maxcount,countIndex)
+
+
+def ZeroIndex(runs,dia,runindex):
+    """
+   #accumulate indexes  
+    Args:
+      dia:diagonal indexes returned from function DiagonalZero
+      runs:start and stop indexes
+   Results:
+     indexlst: list of indexes
+    """
+    i=runs[0]
+    j=runs[1]
+    indexlst=[]
+    for i in range(i,j):
+       indexlst+=[dia[i]]
+
+    return indexlst
+
+
+
+def DupIndex(x,y,indexzero):
+    """
+    Args:
+    x & y : vectors
+    indexzero:Output from function zeroindex
+    Result:
+     xindex& yindex:matrix indexes for vector x and y
+     ts1dup &ts2dup: duplicated values
+    """
+    yvalue=[]
+    xvalue=[]
+    xdup=[]
+    ydup=[]
+    if len(y)>len(x):
+       yval=y
+       xval=x
+    elif len(x)>len(y):
+      xval=y
+      yval=x
+    elif len(y)==len(x):
+      xval=x
+      yval=y
+    for i in range(len(indexzero)):
+       yvalue+=[indexzero[i][0]]
+    yindex=yvalue
+    for i in range(len(indexzero)):
+       xvalue+=[indexzero[i][1]]
+    xindex=xvalue
+    for i in range(len(xindex)):
+        val=xindex[i]
+        xdup+=[xval[val]] 
+    ts1dup=xdup
+    ts1dup.reverse()
+    for i in range(len(yindex)):
+        val=yindex[i]
+        ydup+=[yval[val]]
+    ts2dup=ydup
+    ts2dup.reverse()
+    return(xindex,yindex,ts1dup,ts2dup)
+   
+def lookupdict(x):
+   """
+   Args:
+     x: vector
+   Returns:
+     indexDict: a dictionary holding the original indexes of vector
+   """
+   indexDict={}
+   for i in range(len(x)):
+       indexDict[i]=x[i]
+   return indexDict
+
+def ExtracteIndex(xDict,yDict,xIndex,yIndex):
+   """
+   Args:
+      xDict & yDict: dictionary from function lookupdict
+      xIndex $ yIndex: line up indexes
+   Returns:
+    xIndexList,yIndexList: a list of original duplicate indexes
+   """
+   xIndexList=[]
+   yIndexList=[]
+   if len(yIndex) > len(xIndex):
+      y=yIndex
+      x=xIndex
+   elif len(xIndex)>len(yIndex):
+      y=xIndex
+      x=yIndex
+     # x.Dict=x
+   elif len(xIndex)==len(yIndex):
+      y=yIndex
+      x=xIndex
+   if len(yDict)>len(xDict):
+      xdict=xDict
+      ydict=yDict
+   elif len(xDict)> len(yDict):
+       xdict=yDict
+       ydict=xDict
+   elif len(xDict)==len(yDict):
+      xdict=xDict
+      ydict=yDict
+   for i in xIndex:
+        xIndexList+=[xdict[i]]
+   for i in yIndex:
+        yIndexList+=[ydict[i]]
+   xIndexList.reverse()
+   yIndexList.reverse()
+   return (xIndexList,yIndexList)
+     
+
+
+###main###
+print "--------------vector one and two--------------------------"
+ts1=np.array(df1['value'])
+ts2=np.array(df2['value'])
+
+print "--------------original index for vector one and two--------------------------------------"
+ts1index= df1.index
+print "-----------------"
+#print ts1index
+ 
+ts2index=df2.index
+#print ts2index
+####
+print "--------------------distance matrix---------------------"
+dis=Distances(ts1,ts2)
+#print dis
+##
+print "----------------list of diagonals ---------------------------------------------"
+arr,index=DiagonalList(dis)
+startindex=index
+zeroarr=arr
+#print startindex
+#print zeroarr
+      
+print "---------------diagonal index with zero---------------------------------------"
+diaindex,diaval= DiagonalZero(dis,ts1,ts2,startindex)
+diavals=diaval
+diaindexes=diaindex
+#print diavals
+#print diaindexes
+
+runs=zero_runs(diavals)
+
+print "---list of indexes---"
+#print runs
+
+print "----maximum count of zero-------------------------------------"
+sumzero,i=CountZero(runs)
+sumindex=i
+#print sumindex
+maxruns=runs[sumindex]
+#print maxruns
+zeroruns=sumzero
+#print zeroruns
+print "*******************"
+######
+diazero=ZeroIndex(maxruns,diaindexes,sumindex)
+####
+#print diazero
+xindex2,yindex2,ts1value,ts2value=DupIndex(ts1,ts2,diazero)
+######
+xindex=xindex2
+yindex=yindex2
+duplicatedts1=ts1value
+duplicatedts2=ts2value
+#print xindex
+#print yindex
+print duplicatedts1
+print"-----------------------dup1------------------------------------------------------"
+print duplicatedts2
+
+print "-----------dictionary for original index--------------------------------------------------"  
+originalindexts1=lookupdict(ts1index)
+originalindexts2=lookupdict(ts2index)
+#print originalindexts1
+#print originalindexts2
+
+
+print "--------------match to original index-------------------------"
+print ExtracteIndex(originalindexts1,originalindexts2,xindex,yindex) 
+
+##
+import matplotlib.pyplot as plt
+plt.subplot(2, 1, 1)
+plt.plot(duplicatedts1,'r-')
+plt.ylabel('vector x')
+plt.subplot(2, 1, 2)
+plt.plot(duplicatedts2)
+plt.ylabel('vector y')
+
+plt.show()

From 62faa5571f3e63ec4caeaf2059f19a30aa23860d Mon Sep 17 00:00:00 2001
From: Amenzeo <34592606+Amenzeo@users.noreply.github.com>
Date: Sun, 12 Aug 2018 03:43:06 -0700
Subject: [PATCH 6/9] Update deduplication_distance.py

---
 deduplication_distance.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/deduplication_distance.py b/deduplication_distance.py
index 7a36820b..f2cb8154 100644
--- a/deduplication_distance.py
+++ b/deduplication_distance.py
@@ -1,46 +1,34 @@
 import numpy as np
 import pandas as pd
-
 """
 Data Deduplication of Continous blood glucose. Final output are indexes of duplicated values, duplicated values and a plot of the values
 """
-
 """
 Test data  from two files
 files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
       0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
 """
-
 #Test data1
 ##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',')
 ##print data
 ##df1=data.loc[data['alp'] =='x', ['value']]
 ##df2=data.loc[data['alp'] =='y', ['value']]
-
 #Test data2
 #data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',')
 ##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']]
 ##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']]
-
-
 #Test data3
 ##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']]
 ##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
-
-
 #Test data4
 ##df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
 ##df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']]
 
-
 #Test data5
 data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',')
 df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']]
 df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']]
 
-
-
-
 def Distances(x,y):
    """
    Compute the distance matrix for vector x and y
@@ -71,7 +59,6 @@ def Distances(x,y):
          distances[i][j] = ((xval[j])-(yval[i]))**2
    return distances
 
-
 def DiagonalList(dis):
    """
    Find the diagonal with the highest count of zero  and the diagonal start index
@@ -94,7 +81,6 @@ def DiagonalList(dis):
           diagonalStartIndex=abs(i)
    return (diagonal,diagonalStartIndex)
 
-
 def DiagonalZero(disMatrix,ts1,ts2,startindex):
    """
    Compute the diagonal Index with the highest count of zero (output from DiagonalList)
@@ -129,8 +115,6 @@ def DiagonalZero(disMatrix,ts1,ts2,startindex):
           i = i+1  
    return (diaIndex,diaValue)
 
-
-
 def zero_runs(diaValue): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array
    """
    Args:
@@ -142,8 +126,6 @@ def zero_runs(diaValue): #https://stackoverflow.com/questions/24885092/finding-t
    absdiff = np.abs(np.diff(iszero))
    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
    return ranges
-
-
 def CountZero(runs):
    """
    Args:
@@ -151,7 +133,6 @@ def CountZero(runs):
    Returns:
     totalCount:returns the count of consecutive zero
     countIndex:list index with max zeros
-     
    """
    maxcount=0
    for i in  range(len(runs)):
@@ -180,8 +161,6 @@ def ZeroIndex(runs,dia,runindex):
 
     return indexlst
 
-
-
 def DupIndex(x,y,indexzero):
     """
     Args:
@@ -272,7 +251,6 @@ def ExtracteIndex(xDict,yDict,xIndex,yIndex):
    return (xIndexList,yIndexList)
      
 
-
 ###main###
 print "--------------vector one and two--------------------------"
 ts1=np.array(df1['value'])

From ce70580d21d9ba7b9bd2421b83bd945358e5bd32 Mon Sep 17 00:00:00 2001
From: Amenzeo <34592606+Amenzeo@users.noreply.github.com>
Date: Tue, 14 Aug 2018 11:10:56 -0700
Subject: [PATCH 7/9] Update deduplication_distance.py

---
 deduplication_distance.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deduplication_distance.py b/deduplication_distance.py
index f2cb8154..40ddffd3 100644
--- a/deduplication_distance.py
+++ b/deduplication_distance.py
@@ -249,8 +249,9 @@ def ExtracteIndex(xDict,yDict,xIndex,yIndex):
    xIndexList.reverse()
    yIndexList.reverse()
    return (xIndexList,yIndexList)
+import time
+start_time = time.time()
      
-
 ###main###
 print "--------------vector one and two--------------------------"
 ts1=np.array(df1['value'])
@@ -322,6 +323,8 @@ def ExtracteIndex(xDict,yDict,xIndex,yIndex):
 print "--------------match to original index-------------------------"
 print ExtracteIndex(originalindexts1,originalindexts2,xindex,yindex) 
 
+print time.time() - start_time, "seconds"
+
 ##
 import matplotlib.pyplot as plt
 plt.subplot(2, 1, 1)

From 73bf8256ffc06ba01c5fae125e425f4587fc3ee1 Mon Sep 17 00:00:00 2001
From: Amenze Okpah <eaokpah@dons.usfca.edu>
Date: Wed, 15 Aug 2018 14:05:55 -0700
Subject: [PATCH 8/9] Documentation and test run

---
 deduplication.Rmd | 386 +++++++++++++++++-----------------------------
 1 file changed, 143 insertions(+), 243 deletions(-)

diff --git a/deduplication.Rmd b/deduplication.Rmd
index c25706e1..159c9729 100644
--- a/deduplication.Rmd
+++ b/deduplication.Rmd
@@ -1,89 +1,129 @@
 ---
-title: "uploadid"
-author: "Amenze"
+title: "De-duplication"
+author: "Amenze Okpah"
 date: "June 28, 2018"
 output: html_document
 ---
 
-```{r}
-setwd("/Users/Amenze/Desktop/tidepool/refdata")
-```
-
-
-```{r }
-#library(data.table)
-library(ggplot2)
-library(plyr)
-library(dplyr)
-#using jmotif
-library(jmotif)
-#install.packages("RecordLinkage")
-library(RecordLinkage)
-#install.packages("PTXQC")
-library("PTXQC")
-library(stringr)
-library(zoo)
-
-
-
-```
-
-
-
+Compare the similarity between two numeric vectors using Symbolic Aggregate approXimation (SAX).
+Applying the SAX algorithm to an input time series transforms it into a string.
+The algorithm was proposed by Lin et al. and extends the PAA-based approach, inheriting the original algorithm's simplicity and low computational complexity while providing satisfactory sensitivity and selectivity in range query processing. Moreover, the use of a symbolic representation opened a door to the existing wealth of data-structures and string-manipulation algorithms in computer science such as hashing, regular expression, pattern matching, suffix trees, and grammatical inference.
+#http://www.cs.ucr.edu/~eamonn/SAX.htm
+#https://jmotif.github.io/sax-vsm_site/morea/algorithm/PAA.html
 
 ```{r}
-
 # create a list from these files
-list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern=".csv$")
-
-
+dir <- choose.dir(default = "", caption = "Select folder")
+list.filenames<-list.files(dir,pattern="\\.csv$",full.names=TRUE) # full paths, so read.csv() below works from any working directory
 #extract files based on duplicated utctime
 for (i in 1:length(list.filenames))
 {
   patient<-read.csv(list.filenames[i])
   patient_cbg<-subset(patient,patient$type=="cbg")
   if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg))
-      write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i]))
-
+      write.csv(patient_cbg,file.path(dir,paste0("duplicated",basename(list.filenames[i])))) # write inside dir, keeping the "duplicated" prefix so the source file is not overwritten
  }
 ```
-
-
-
-
-
+  # Converts two vectors to their string equivalents
+  # Args:
+  # x: vector of numeric values for uploadid1
+  # y: vector of numeric values for uploadid2
+  # alpha.Size: alphabet size (number of characters used to discretize the numeric values)
+  # Returns:
+  #   longeststring: longest common substring match between the discretized values of x and y (stringX and stringY respectively)
+  # percentage similarity between vector x and y
+
+#Note: since PAA takes normalized values, both vectors are normalized using the mean and standard deviation of vector x.
+#PAA: the length of the PAA output is fixed to the length of each vector to avoid reducing the dimensions, since all values are needed to check for duplication.
 ```{r}
-##read in file and check for duplicated utc
-
-field<-c("deviceId","id","uploadId","utcTime","type","value")
-patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field]
-patient$mgvalue<-patient$value*18.01559
-length(unique(patient$uploadId)) #1319
-length(unique(patient$id))==nrow(patient) ##True
-length(unique(patient$utcTime))#72287
-length(unique(patient$utcTime))==nrow(patient) #False
-
-
+StringConvert <- function(x, y, alpha.Size){
+  # Both vectors are z-normalised with the mean and sd of x, i.e. with the
+  # same transform. The old if/else on the lengths picked x in both branches,
+  # so it has been removed.
+  # Returns an unnamed list: [[1]] longest common substring (LCSn),
+  # [[2]] string for x, [[3]] string for y, [[4]] levenshteinSim() of both.
+  ref.Mean <- mean(x)
+  ref.Dev <- sd(x)
+  # A zero/NA sd would otherwise turn every value into NaN/NA silently.
+  if (is.na(ref.Dev) || ref.Dev == 0) {
+    stop("StringConvert: sd(x) is zero or NA; cannot z-normalise", call. = FALSE)
+  }
+  # paa() is asked for length(x) / length(y) segments, i.e. no reduction.
+  x.PAA <- paa((x - ref.Mean) / ref.Dev, length(x))
+  y.PAA <- paa((y - ref.Mean) / ref.Dev, length(y))
+  stringX <- series_to_string(x.PAA, alpha.Size)
+  stringY <- series_to_string(y.PAA, alpha.Size)
+  longeststring <- LCSn(c(stringX, stringY))
+  list(longeststring, stringX, stringY, levenshteinSim(stringX, stringY))
+}
+```
+#Computes the range of matching characters between the string values and the common subsequence
+  Args:
+    values: output from the StringConvert function
+  Returns:
+     loc: matching ranges
+```{r}
+IndexRange<-function(values){
+  # values[1] holds the common substring; values[2:3] hold the two strings.
+  pattern <- as.character(values[1])
+  haystacks <- as.character(values[2:3])
+  list(str_locate(haystacks, pattern))
+}
+```
+## Computes the duplicated values of each input and their indexes
+  # Args:
+  # x & y: data frames with a "value" column; IndexRange: output of IndexRange()
+  # Returns:
+  #   duplicate.X: duplicated values for x
+  # duplicate.Y: duplicated values for y
+  # index.X: row names of the duplicated values in x
+  # index.Y: row names of the duplicated values in y
+```{r}
+ExtractDuplicateIndex <- function(x, y, IndexRange){
+   # IndexRange[[1]] is a start/end matrix: row 1 is used for x, row 2 for y.
+   loc <- IndexRange[[1]]
+   rows.X <- loc[1, 1] : loc[1, 2]
+   rows.Y <- loc[2, 1] : loc[2, 2]
+   list(x[rows.X, "value"], y[rows.Y, "value"], row.names(x[rows.X, ]), row.names(y[rows.Y, ]))
+}
+```
+##computes a list of indexes 
+  Args:
+    x: vectors
+  Returns:
+    duplicate.Index:duplicated value index
+```{r}
+ExtractVectorIndex<-function(x){
+  # x: list of index vectors (e.g. validate[3], row names stored as strings).
+  # The old unbraced for-loop overwrote its result on every pass, so only the
+  # last element survived and an empty list failed; flatten everything instead.
+  as.numeric(unlist(x))
+}
+```
+#import files and subset data based on uploadIds to test  function
 
+```{r}
+##read in file and check for duplicated utcTime
+field <- c("deviceId", "id", "uploadId", "utcTime", "type", "value")
+patient <- read.csv("FileName ")[,field] #0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
+patient$mgvalue <- patient$value*18.01559
+
+#check for unique uploadIds
+uniqueid <- function(df){
+   # Return the distinct uploadId values in df. The earlier version looped once
+   # over the uploadId column but returned unique(df$utcTime), which matched
+   # neither the function name nor the "unique uploadIds" comment above it.
+   unique(df$uploadId)
+}
 ###subset patient file based on  5 uploadids
-df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f")
-
-df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb")
-
-
-df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
-
-
-df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834")
-
-
-
-df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5")
+df1 <- subset(patient, patient$uploadId == "upid_3c41703c2d3a8b97f479afdb6ccf799f")
+df2 <- subset(patient, patient$uploadId == "upid_3fc32e5ad912a8ea7efced9151804bdb")
+df3 <- subset(patient, patient$uploadId == "upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
+df4 <- subset(patient, patient$uploadId == "upid_5fad608cf32bd03a1cd56e3bb1fdb834")
+df5 <- subset(patient, patient$uploadId == "upid_830c6de3e2ecbbec6fbad0cecc64bdf5")
 
 #plot values for each uploadid
-par(mfrow=c(3,3))
-
-
+par(mfrow=c(3, 3))
 plot.ts(df1$value)
 plot.ts(df2$value)
 plot.ts(df3$value)
@@ -92,205 +132,65 @@ plot.ts(df5$value)
 
 ```
 
-
-
 ```{r}
-## input
-#x:vector value for uploadid x 
-#y: vector value for uploadid y
-#outputs 
-#longeststring:longest common substring match between discretize value of x and y respectively p1 & p2
-#percentage similarity
-#alphaxy:the alphabet size
-
-stringcnvert<-function(x,y,alphaxy){
-  if (length(x)!=length(y)){
-    normvalue=x
-  }
-  else if(length(x)==length(y)){
-    normvalue=x
-  }
-  normvalue.mean <- mean(normvalue)
-  normvalue.dev<-sd(normvalue)
-  xznorm<-(x - normvalue.mean)/normvalue.dev
-  yznorm<-(y - normvalue.mean)/normvalue.dev
-  y_paa1 = paa(xznorm,length(x)) ##we decided to use the exact length of the vector based on what we want to achieve
-  y_paa2 = paa(yznorm,length(y))
-  xstringvalue<-series_to_string(y_paa1, alphaxy)
-  ystringvalue<-series_to_string(y_paa2, alphaxy) 
-  p1<-xstringvalue
-  p2<-ystringvalue
-  longeststring<-LCSn(c(p1,p2))
-  return (list(longeststring,p1,p2,(levenshteinSim(xstringvalue,ystringvalue))))
-
-
-}
-
-
-
-
-##input
-#values:output from stringcnvert function
-##ouptputs
-#stringvalue: 2 strings compared 
-#substringrep:matching substring that was compared, 
-#stringdetect:boolean eqivalent of string detected(TRUE:if detected, FALSE:not detected)
-#stringcount:count of substrng in stringvalue 
-#loc:starting and ending indexes for substring in the two strings compared.
-
-duplicateindex<-function(values){
-  stringsvalues <- as.character(c(values[2],values[3]))
-  substringrep <- as.character(values[1])
-  stringdetect<-str_detect(stringsvalues,substringrep)
-  stringcount<-str_count(stringsvalues, substringrep)
-  loc <- str_locate(stringsvalues,substringrep )
-  return(list(stringsvalues,substringrep,stringdetect,stringcount,loc))
-}
-
-
-
-##input
-#x & y:dataframe
-#strngindex: output from  duplicateindex function(loc)
-##ouptputs
-#s1:numeric vector ,
-#s2:numeric vector, 
-#index1:indexes of s1, 
-#index2:indexes of s2
-#
-extractindex<-function(x,y,strngindex){
-  
-   s1<-x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],"value"]
-   s2<-y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],"value"]
-   indexs1<-row.names(x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],])
-   indexs2<-row.names(y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],])
-
-  return(list(s1,s2,indexs1,indexs2))
-
-}
-
-
-###input
-#x:a list of indexes
-##ouptputs
-#indexnum1:numeric equivalent of indexes
-
-extractvalue<-function(x){
-  for (i in 1:length(x)[[1]])
-  indexnum <- c(x[[i]]) 
-  indexnum1<-as.numeric(indexnum)
-return(indexnum1)
-}
-
-##main
-#
-values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
-strngindex<-duplicateindex(values)
-validate<-extractindex(df1,df2,strngindex)
-seq1<-extractvalue(validate[3])
-seq2<-extractvalue(validate[4])
-
-df1_mgvalue<-patient[seq1,"mgvalue"]
-df2_mgvalue<-patient[seq2,"mgvalue"]
-compare<-data.frame(df1_mgvalue,df2_mgvalue)
-
-# par(mfrow=c(1,2))
-# plot.ts(compare$df1_mgvalue)
-# plot.ts(compare$df2_mgvalue)
-
-plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+system.time(values <- StringConvert(df1$mgvalue, df2$mgvalue,3))  #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
+system.time(strngindex <- IndexRange(values))
+validate <- ExtractDuplicateIndex(df1, df2, strngindex)
+seq1 <- ExtractVectorIndex(validate[3])
+seq2 <- ExtractVectorIndex(validate[4])
+df1_mgvalue <- patient[seq1,"value"]
+df2_mgvalue <- patient[seq2,"value"]
+compare <- data.frame(df1_mgvalue,df2_mgvalue)
+
+#plot duplicated values
+plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue), 
          plot.type = "multiple", 
          col = c("red", "blue"))
-
-
-sum(compare$df1_mgvalue-compare$df2_mgvalue)
-
-
+sum(compare$df1_mgvalue - compare$df2_mgvalue)
 ```
 
-
-
 ```{r}
-values<-stringcnvert(df3$mgvalue,df4$mgvalue,3) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run
-strngindex<-duplicateindex(values)
-validate<-extractindex(df3,df4,strngindex)
-seq1<-extractvalue(validate[3])
-seq2<-extractvalue(validate[4])
-
-df1_mgvalue<-patient[seq1,"mgvalue"]
-df2_mgvalue<-patient[seq2,"mgvalue"]
-compare<-data.frame(df1_mgvalue,df2_mgvalue)
-
-# par(mfrow=c(1,2))
-# plot.ts(compare$df1_mgvalue)
-# plot.ts(compare$df2_mgvalue)
-
-plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+values <- StringConvert(df3$mgvalue, df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run
+strngindex <- IndexRange(values)
+validate <- ExtractDuplicateIndex(df3,df4,strngindex)
+seq1 <- ExtractVectorIndex(validate[3])
+seq2 <- ExtractVectorIndex(validate[4])
+
+df1_mgvalue <- patient[seq1,"value"]
+df2_mgvalue <- patient[seq2,"value"]
+compare <- data.frame(df1_mgvalue, df2_mgvalue)
+plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue), 
          plot.type = "multiple", 
          col = c("red", "blue"))
-
-
-sum(compare$df1_mgvalue-compare$df2_mgvalue)
+sum(compare$df1_mgvalue - compare$df2_mgvalue)
 ```
 
-
-
-
 ```{r}
-patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field]
-patient$mgvalue<-patient$value*18.01559
-length(unique(patient$uploadId)) #2
-length(unique(patient$id))==nrow(patient) ##True
-length(unique(patient$utcTime))# 8899
-anyDuplicated(patient$utcTime) #4366
-length(unique(patient$utcTime))==nrow(patient) #False
-
-
-
+patient <- read.csv("filename")[,field] #0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
+patient$mgvalue <- patient$value*18.01559
+length(unique(patient$uploadId)) 
+length(unique(patient$id))==nrow(patient) 
+length(unique(patient$utcTime))
+anyDuplicated(patient$utcTime) 
+length(unique(patient$utcTime))==nrow(patient) 
 df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2")
-
-
 df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5")
-  
-
-
 ```
 
-
 ```{r}
-values<-stringcnvert(df1$mgvalue,df2$mgvalue,8)
-strngindex<-duplicateindex(values)
-validate<-extractindex(df1,df2,strngindex)
-seq1<-extractvalue(validate[3])
-seq2<-extractvalue(validate[4])
-
-df1_mgvalue<-patient[seq1,"mgvalue"]
-df2_mgvalue<-patient[seq2,"mgvalue"]
+values <- StringConvert(df1$mgvalue,df2$mgvalue,3)
+strngindex <- IndexRange(values)
+validate <- ExtractDuplicateIndex(df1,df2,strngindex)
+seq1 <- ExtractVectorIndex(validate[3])
+seq2 <- ExtractVectorIndex(validate[4])
+
+df1_mgvalue<-patient[seq1,"value"]
+df2_mgvalue<-patient[seq2,"value"]
 compare<-data.frame(df1_mgvalue,df2_mgvalue)
 
-
-
 plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
          plot.type = "multiple", 
          col = c("red", "blue"))
-
 sum(compare$df1_mgvalue-compare$df2_mgvalue)
 
-
 ```
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#cross correlation

From 6ac33e872e44f9c4cf802d06d9c7e7541ed9acbb Mon Sep 17 00:00:00 2001
From: Amenze Okpah <eaokpah@dons.usfca.edu>
Date: Wed, 15 Aug 2018 14:06:52 -0700
Subject: [PATCH 9/9] Data Deduplication functions

---
 deduplicationFunction.R | 72 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 deduplicationFunction.R

diff --git a/deduplicationFunction.R b/deduplicationFunction.R
new file mode 100644
index 00000000..1f5a0e48
--- /dev/null
+++ b/deduplicationFunction.R
@@ -0,0 +1,72 @@
+StringConvert <- function(x, y, alpha.Size){
+  # Converts two numeric vectors to letter strings and compares the strings.
+  # Both vectors are normalized with the mean and sd of x, whether or not x
+  # and y have the same length, so y is expressed on the scale of x.
+  # Args:
+  #   x: numeric vector of values for uploadId x
+  #   y: numeric vector of values for uploadId y
+  #   alpha.Size: alphabet size passed to series_to_string()
+  # Returns:
+  #   Unnamed list of four elements, in this order:
+  #     1. longest common substring of the two strings, from LCSn()
+  #     2. string for x
+  #     3. string for y
+  #     4. levenshteinSim() similarity between the two strings
+  # NOTE(review): a constant x gives sd(x) == 0 and non-finite normalized
+  # values; confirm callers never pass one.
+  x.Mean <- mean(x)
+  x.Dev <- sd(x)
+  x.Normalized <- (x - x.Mean) / x.Dev
+  y.Normalized <- (y - x.Mean) / x.Dev
+  # paa() is asked for as many points as the input vector has.
+  x.PAA <- paa(x.Normalized, length(x))
+  y.PAA <- paa(y.Normalized, length(y))
+  stringX <- series_to_string(x.PAA, alpha.Size)
+  stringY <- series_to_string(y.PAA, alpha.Size)
+  longeststring <- LCSn(c(stringX, stringY))
+  similarity <- levenshteinSim(stringX, stringY)
+  return(list(longeststring, stringX, stringY, similarity))
+}
+
+
+IndexRange <- function(values){
+  # Locates the common substring inside each of the two strings.
+  # Args:
+  #   values: list returned by StringConvert(); element 1 is the common
+  #           substring, elements 2 and 3 are the strings for x and y
+  # Returns:
+  #   One-element list holding the str_locate() result for the two strings
+  #   (row 1 for the x string, row 2 for the y string)
+  pattern <- as.character(values[1])
+  list(str_locate(as.character(c(values[2], values[3])), pattern))
+}
+
+
+ExtractDuplicateIndex <- function(x, y, IndexRange){
+  # Extracts the rows of x and y that fall inside the matched ranges.
+  # Args:
+  #   x, y: data frames with a "value" column
+  #   IndexRange: one-element list holding a two-row start/end matrix
+  #               (row 1 applies to x, row 2 to y), as IndexRange() returns
+  # Returns:
+  #   Unnamed list of four elements, in this order:
+  #     values of x in range, values of y in range,
+  #     row names of x in range, row names of y in range
+  # span(k) expands row k of the matrix into start:end row positions.
+  span <- function(k) IndexRange[[1]][k, 1] : IndexRange[[1]][k, 2]
+  list(x[span(1), "value"], y[span(2), "value"],
+       row.names(x[span(1), ]), row.names(y[span(2), ]))
+}
+
+
+ExtractVectorIndex <- function(x){
+  # Flattens a list (or vector) of indexes into one numeric vector.
+  # Args:
+  #   x: list or vector of indexes, e.g. a one-element sub-list of the
+  #      list returned by ExtractDuplicateIndex()
+  # Returns:
+  #   duplicate.Index: numeric vector holding every index found in x, in
+  #   order; numeric(0) when x is empty
+  duplicate.Index <- as.numeric(unlist(x))
+  return(duplicate.Index)
+}
\ No newline at end of file