-
Notifications
You must be signed in to change notification settings - Fork 6
data de-duplication using distances #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
659a938
bb99788
360d1dc
611c7e6
8fccbbb
62faa55
ce70580
73bf825
6ac33e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,384 @@ | ||
| --- | ||
| title: "uploadid" | ||
| author: "Amenze" | ||
| date: "June 28, 2018" | ||
| output: html_document | ||
| --- | ||
|
|
||
| ```{r} | ||
| setwd("/Users/Amenze/Desktop/tidepool/refdata") | ||
| ``` | ||
|
|
||
|
|
||
| ```{r } | ||
| #library(data.table) | ||
| library(ggplot2) | ||
| library(plyr) | ||
| library(dplyr) | ||
| #using jmotif | ||
| library(jmotif) | ||
| #install.packages("RecordLinkage") | ||
| library(RecordLinkage) | ||
| #install.packages("PTXQC") | ||
| library("PTXQC") | ||
| library(stringr) | ||
| library(zoo) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a similar block to this that includes the |
||
|
|
||
|
|
||
|
|
||
| ``` | ||
|
|
||
|
|
||
|
|
||
|
|
||
| ```{r} | ||
|
|
||
| # create a list from these files | ||
| list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern=".csv$") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's make this a choose dialog since this directory path isn't going to work on anyone else's computer. |
||
|
|
||
|
|
||
| #extract files based on duplicated utctime | ||
| for (i in 1:length(list.filenames)) | ||
| { | ||
| patient<-read.csv(list.filenames[i]) | ||
| patient_cbg<-subset(patient,patient$type=="cbg") | ||
| if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg)) | ||
| write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i])) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hardcoded path.... |
||
|
|
||
| } | ||
| ``` | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove unnecessary blank lines |
||
|
|
||
|
|
||
|
|
||
|
|
||
| ```{r} | ||
| ##read in file and check for duplicated utc | ||
|
|
||
| field<-c("deviceId","id","uploadId","utcTime","type","value") | ||
| patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hardcoded path... |
||
| patient$mgvalue<-patient$value*18.01559 | ||
| length(unique(patient$uploadId)) #1319 | ||
| length(unique(patient$id))==nrow(patient) ##True | ||
| length(unique(patient$utcTime))#72287 | ||
| length(unique(patient$utcTime))==nrow(patient) #False | ||
|
|
||
|
|
||
|
|
||
| ###subset patient file based on 5 uploadids | ||
| df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f") | ||
|
|
||
| df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb") | ||
|
|
||
|
|
||
| df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b") | ||
|
|
||
|
|
||
| df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834") | ||
|
|
||
|
|
||
|
|
||
| df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5") | ||
|
|
||
| #plot values for each uploadid | ||
| par(mfrow=c(3,3)) | ||
|
|
||
|
|
||
| plot.ts(df1$value) | ||
| plot.ts(df2$value) | ||
| plot.ts(df3$value) | ||
| plot.ts(df4$value) | ||
| plot.ts(df5$value) | ||
|
|
||
| ``` | ||
|
|
||
|
|
||
|
|
||
| ```{r} | ||
| ## input | ||
| #x:vector value for uploadid x | ||
| #y: vector value for uploadid y | ||
| #outputs | ||
| #longeststring:longest common substring match between discretize value of x and y respectively p1 & p2 | ||
| #percentage similarity | ||
| #alphaxy:the alphabet size | ||
|
|
||
| stringcnvert<-function(x,y,alphaxy){ | ||
| if (length(x)!=length(y)){ | ||
| normvalue=x | ||
| } | ||
| else if(length(x)==length(y)){ | ||
| normvalue=x | ||
| } | ||
| normvalue.mean <- mean(normvalue) | ||
| normvalue.dev<-sd(normvalue) | ||
| xznorm<-(x - normvalue.mean)/normvalue.dev | ||
| yznorm<-(y - normvalue.mean)/normvalue.dev | ||
| y_paa1 = paa(xznorm,length(x)) ##we decided to use the exact length of the vector based on what we want to achieve | ||
| y_paa2 = paa(yznorm,length(y)) | ||
| xstringvalue<-series_to_string(y_paa1, alphaxy) | ||
| ystringvalue<-series_to_string(y_paa2, alphaxy) | ||
| p1<-xstringvalue | ||
| p2<-ystringvalue | ||
| longeststring<-LCSn(c(p1,p2)) | ||
| return (list(longeststring,p1,p2,(levenshteinSim(xstringvalue,ystringvalue)))) | ||
|
|
||
|
|
||
| } | ||
|
|
||
|
|
||
|
|
||
|
|
||
| ##input | ||
| #values:output from stringcnvert function | ||
| ##ouptputs | ||
| #stringvalue: 2 strings compared | ||
| #substringrep:matching substring that was compared, | ||
| #stringdetect:boolean eqivalent of string detected(TRUE:if detected, FALSE:not detected) | ||
| #stringcount:count of substrng in stringvalue | ||
| #loc:starting and ending indexes for substring in the two strings compared. | ||
|
|
||
| duplicateindex<-function(values){ | ||
| stringsvalues <- as.character(c(values[2],values[3])) | ||
| substringrep <- as.character(values[1]) | ||
| stringdetect<-str_detect(stringsvalues,substringrep) | ||
| stringcount<-str_count(stringsvalues, substringrep) | ||
| loc <- str_locate(stringsvalues,substringrep ) | ||
| return(list(stringsvalues,substringrep,stringdetect,stringcount,loc)) | ||
| } | ||
|
|
||
|
|
||
|
|
||
| ##input | ||
| #x & y:dataframe | ||
| #strngindex: output from duplicateindex function(loc) | ||
| ##ouptputs | ||
| #s1:numeric vector , | ||
| #s2:numeric vector, | ||
| #index1:indexes of s1, | ||
| #index2:indexes of s2 | ||
| # | ||
| extractindex<-function(x,y,strngindex){ | ||
|
|
||
| s1<-x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],"value"] | ||
| s2<-y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],"value"] | ||
| indexs1<-row.names(x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],]) | ||
| indexs2<-row.names(y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],]) | ||
|
|
||
| return(list(s1,s2,indexs1,indexs2)) | ||
|
|
||
| } | ||
|
|
||
|
|
||
| ###input | ||
| #x:a list of indexes | ||
| ##ouptputs | ||
| #indexnum1:numeric equivalent of indexes | ||
|
|
||
| extractvalue<-function(x){ | ||
| for (i in 1:length(x)[[1]]) | ||
| indexnum <- c(x[[i]]) | ||
| indexnum1<-as.numeric(indexnum) | ||
| return(indexnum1) | ||
| } | ||
|
|
||
| ##main | ||
| # | ||
| system.time(values<-stringcnvert(df1$mgvalue,df2$mgvalue,3))#http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. | ||
| #user system elapsed | ||
| # 0.11 0.00 0.11 | ||
|
|
||
| system.time(strngindex<-duplicateindex(values)) | ||
| validate<-extractindex(df1,df2,strngindex) | ||
| seq1<-extractvalue(validate[3]) | ||
| seq2<-extractvalue(validate[4]) | ||
|
|
||
| df1_mgvalue<-patient[seq1,"mgvalue"] | ||
| df2_mgvalue<-patient[seq2,"mgvalue"] | ||
| compare<-data.frame(df1_mgvalue,df2_mgvalue) | ||
|
|
||
| # par(mfrow=c(1,2)) | ||
| # plot.ts(compare$df1_mgvalue) | ||
| # plot.ts(compare$df2_mgvalue) | ||
|
|
||
| plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), | ||
| plot.type = "multiple", | ||
| col = c("red", "blue")) | ||
|
|
||
|
|
||
| sum(compare$df1_mgvalue-compare$df2_mgvalue) | ||
|
|
||
|
|
||
| ``` | ||
|
|
||
|
|
||
|
|
||
| ```{r} | ||
| values<-stringcnvert(df3$mgvalue,df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run | ||
| strngindex<-duplicateindex(values) | ||
| validate<-extractindex(df3,df4,strngindex) | ||
| seq1<-extractvalue(validate[3]) | ||
| seq2<-extractvalue(validate[4]) | ||
|
|
||
| df1_mgvalue<-patient[seq1,"mgvalue"] | ||
| df2_mgvalue<-patient[seq2,"mgvalue"] | ||
| compare<-data.frame(df1_mgvalue,df2_mgvalue) | ||
|
|
||
| # par(mfrow=c(1,2)) | ||
| # plot.ts(compare$df1_mgvalue) | ||
| # plot.ts(compare$df2_mgvalue) | ||
|
|
||
| plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), | ||
| plot.type = "multiple", | ||
| col = c("red", "blue")) | ||
|
|
||
|
|
||
| sum(compare$df1_mgvalue-compare$df2_mgvalue) | ||
| ``` | ||
|
|
||
|
|
||
|
|
||
|
|
||
| ```{r} | ||
| patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field] | ||
| patient$mgvalue<-patient$value*18.01559 | ||
| length(unique(patient$uploadId)) #2 | ||
| length(unique(patient$id))==nrow(patient) ##True | ||
| length(unique(patient$utcTime))# 8899 | ||
| anyDuplicated(patient$utcTime) #4366 | ||
| length(unique(patient$utcTime))==nrow(patient) #False | ||
|
|
||
|
|
||
|
|
||
| df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2") | ||
|
|
||
|
|
||
| df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5") | ||
|
|
||
|
|
||
|
|
||
| ``` | ||
|
|
||
|
|
||
| ```{r} | ||
| values<-stringcnvert(df1$mgvalue,df2$mgvalue,8) | ||
| strngindex<-duplicateindex(values) | ||
| validate<-extractindex(df1,df2,strngindex) | ||
| seq1<-extractvalue(validate[3]) | ||
| seq2<-extractvalue(validate[4]) | ||
|
|
||
| df1_mgvalue<-patient[seq1,"mgvalue"] | ||
| df2_mgvalue<-patient[seq2,"mgvalue"] | ||
| compare<-data.frame(df1_mgvalue,df2_mgvalue) | ||
|
|
||
|
|
||
|
|
||
| plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), | ||
| plot.type = "multiple", | ||
| col = c("red", "blue")) | ||
|
|
||
| sum(compare$df1_mgvalue-compare$df2_mgvalue) | ||
|
|
||
|
|
||
| ``` | ||
|
|
||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Blank lines... |
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
| #cross correlation | ||
| ```{r} | ||
| corre<-function(x,y,lagsize){ | ||
| corr<-ccf(x$mgvalue,y$mgvalue,max.lag=lagsize,plot=TRUE) | ||
| max.value<-max(corr$acf) | ||
| max.lag<-corr$lag[which(corr$acf==max.value)] | ||
| xlength<-length(x$mgvalue) | ||
| ylength<-length(y$mgvalue) | ||
| if ((max.lag==0) & (xlength<ylength)) { | ||
| computelength<-xlength | ||
| xvalue<-x | ||
| yvalue<-y[max.lag:computelength,] | ||
| } | ||
| else if ((max.lag==0)& (ylength<xlength)){ | ||
| computelength=ylength | ||
| xvalue=x[1:computelength,] | ||
| yvalue=y | ||
| } | ||
| else if((max.lag==0) & (xlength==ylength)){ | ||
| xvalue=x | ||
| yvalue=y | ||
| } | ||
|
|
||
| if ((max.lag>0)& (xlength<ylength)){ | ||
| startcomputelength=max.lag | ||
| endcomputelength<-xlength-max.lag | ||
| xvalue=x[max.lag:endcomputelength,] | ||
| yvalue=y[max.lag:endcomputelength,] | ||
| } | ||
| if ((max.lag>0)&(ylength<xlength)){ | ||
| startcomputelength<-max.lag | ||
| endcomputelength<-ylength-max.lag | ||
| xvalue<-x[max.lag:endcomputelength,] | ||
| yvalue<-y[max.lag:endcomputelength,] | ||
| } | ||
| else if((max.lag>0) & (xlength==ylength)){ | ||
| startcomputelength<-max.lag | ||
| xvalue=x[max.lag:length(x),] | ||
| yvalue=y[max.lag:length(y),] | ||
| } | ||
| if (max.lag<0){ | ||
| startcomputelength<-abs(max.lag) | ||
| xendcomputelength<-xlength-startcomputelength | ||
| yendcomputelength<-ylength-startcomputelength | ||
| xvalue<-x[xendcomputelength:1,] | ||
| yvalue<-y[ylength:(startcomputelength+1),] | ||
| } | ||
|
|
||
| return (list(df1=xvalue,df2=yvalue,max_correlation=max.value)) | ||
|
|
||
| } | ||
|
|
||
| system.time(corr<-corre(df1,df2,10)) | ||
| df1adj<-corr$df1 | ||
| df2adj<-corr$df2 | ||
|
|
||
| #ccf(df1$mgvalue,df2$mgvalue) | ||
|
|
||
|
|
||
| ##extracte the index | ||
|
|
||
| t1indexvalue<-row.names(df1adj) | ||
| t2indexvalue<-row.names(df2adj) | ||
| ts1<-patient[t1indexvalue,"mgvalue"] | ||
| ts2<-patient[t2indexvalue,"mgvalue"] | ||
|
|
||
| plot.zoo(cbind(ts1,ts2), | ||
| plot.type = "multiple", | ||
| col = c("red", "blue")) | ||
| sum(ts1-ts2) | ||
|
|
||
|
|
||
| ``` | ||
|
|
||
|
|
||
| ```{r} | ||
| corr<-corre(df3,df4,10) | ||
| df1adj<-corr$df1 | ||
| df2adj<-corr$df2 | ||
|
|
||
|
|
||
|
|
||
| ``` | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove this since there's no guarantee anyone else will have the same directory structure.