From 417fa87b895cec7b2fd44b09a1f65282ec8544b5 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 08:48:11 -1000 Subject: [PATCH 01/19] Add files via upload --- R/model_ivpin.R | 905 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 905 insertions(+) create mode 100644 R/model_ivpin.R diff --git a/R/model_ivpin.R b/R/model_ivpin.R new file mode 100644 index 0000000..8e43c0d --- /dev/null +++ b/R/model_ivpin.R @@ -0,0 +1,905 @@ + +## - | FILE HEADER | +## +## Script name: +## model_ivpin.R +## +## Purpose of script: +## Implement the estimation of the Intraday Volume-synchronized PIN +## measure (IVPIN). +## +## Author: +## Alexandre Borentain +## +## Last updated: +## 2024-06-18 +## +## License: +## GPL 3 +## +## Email: +## aboren@uw.edu +## +## +## +## Public functions: +## ++++++++++++++++++ +## +## ivpin(): +## Estimates the Intraday Volume-Synchronized Probability of Informed +## Trading. +## +## ++++++++++++++++++ +## +## +## -- +## Package: PINstimation +## website: www.pinstimation.com +## Authors: Montasser Ghachem and Oguz Ersan + + +## +++++++++++++++++++++++++ +## ++++++| | PUBLIC FUNCTIONS | | +## +++++++++++++++++++++++++ + + +#' @title Estimation of Intraday Volume-Synchronized PIN model +#' +#' @description Estimates the Intraday Volume-Synchronized Probability of Informed +#' Trading. +#' +#' @usage ivpin(data, timebarsize = 60, buckets = 50, samplength = 50, +#' tradinghours = 24, verbose = TRUE) +#' +#' @param data A dataframe with 3 variables: +#' \code{timestamp, price, volume}. +#' @param timebarsize An integer referring to the size of timebars +#' in seconds. The default value is \code{60}. +#' @param buckets An integer referring to the number of buckets in a +#' daily average volume. The default value is \code{50}. +#' @param samplength An integer referring to the sample length +#' or the window size used to calculate the `IVPIN` vector. +#' The default value is \code{50}. +#' @param tradinghours An integer referring to the length of daily +#' trading sessions in hours. The default value is \code{24}. +#' +#' @param verbose A binary variable that determines whether detailed +#' information about the steps of the estimation of the IVPIN model is displayed. +#' No output is produced when \code{verbose} is set to \code{FALSE}. The default +#' value is \code{TRUE}. +#' +#' @details The dataframe data should contain at least three variables. Only the +#' first three variables will be considered and in the following order +#' \code{timestamp, price, volume}. +#' +#' The property \code{@bucketdata} is created as in +#' \insertCite{abad2012;textual}{PINstimation}. +#' +#' The argument `timebarsize` is in seconds enabling the user to implement +#' shorter than `1` minute intervals. The default value is set to `1` minute +#' (`60` seconds). +#' +#' The parameter `tradinghours` is used to eventually correct the duration per +#' bucket. The duration of a given bucket is the difference between the +#' timestamp of the last trade `endtime` and the timestamp of the first trade +#' `stime` in the bucket. If the first trade and the last trade in a +#' bucket occur in two different days, and the market trading session does not +#' cover a full day \code{(24 hours)}; then the duration of the bucket will be +#' inflated. Assume that the daily trading session is 8 hours +#' `(tradinghours=8)`, the start time of a bucket is \code{2018-10-12 17:06:40} +#' and its end time is \code{2018-10-13 09:36:00}. A straightforward calculation +#' gives that the duration of this bucket is \code{59,360 secs}. However, this +#' duration includes the time during which the market is closed `(16 hours)`. +#' The corrected duration takes into consideration only the time of market +#' activity: `duration=59,360-16*3600= 1760 secs`, i.e., about `30 minutes`. +#' +#' @return Returns an object of class \code{estimate.ivpin}. +#' +#' @references +#' +#' \insertAllCited +#' +#' @examples +#' # There is a preloaded dataset called 'hfdata' contained in the package. +#' # It is an artificially created high-frequency trading data. The dataset +#' # contains 100 000 trades and five variables 'timestamp', 'price', +#' # 'volume', 'bid' and 'ask'. For more information, type ?hfdata. +#' +#' xdata <- hfdata +#' +#' # Estimate IVPIN model, using the following parameter set where the time +#' # bar size is 5 minutes, i.e., 300 seconds (timebarsize = 300), 50 +#' # buckets per average daily volume (buckets = 50), and a window size of +#' # 250 for the IVPIN calculation (samplength = 250). +#' +#' estimate <- ivpin(xdata, timebarsize = 300, buckets = 50, samplength = 250) +#' +#' # Display a description of the estimate +#' +#' show(estimate) +#' +#' # Plot the estimated IVPIN vector +#' +#' plot(estimate@vpin, type = "l", xlab = "time", ylab = "IVPIN", col = "blue") +#' +#' # Display the parameters of IVPIN estimates +#' +#' show(estimate@parameters) +#' +#' # Store the computed data of the different buckets in a dataframe 'buckets'. +#' # Display the first 10 rows of the dataframe 'buckets'. +#' +#' buckets <- estimate@bucketdata +#' show(head(buckets, 10)) +#' +#' # Store the daily IVPIN values (weighted and unweighted) in a dataframe +#' # 'dayivpin'. +#' +#' # Display the first 10 rows of the dataframe 'dayivpin'. +#' +#' dayivpin <- estimate@dailyvpin +#' show(head(dayivpin, 10)) +#' +#' @export +ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, + tradinghours = 24, verbose = TRUE) { + + " + @timebarsize : the size of timebars in seconds default value: 60 + @buckets : number of buckets per volume of bucket size default value: 50 + @samplength : sample length or window of buckets to calculate iVPIN default + value: 50 + @tradinghours : length of trading days - used to correct the durations of + buckets. Default value is 24. + " + sigmoid=function(x){ + y=1/(1+exp(x)) + return(y) + } + logit=function(y){ + x=log(1/y-1) + return(x) + } + vpin_err <- uierrors$vpin() + vpin_ms <- uix$vpin(timebarsize = timebarsize) + + # Check that all variables exist and do not refer to non-existent variables + # -------------------------------------------------------------------------- + allvars <- names(formals()) + environment(.xcheck$existence) <- environment() + .xcheck$existence(allvars, err = uierrors$vpin()$fn) + + # Checking the arguments of the function are valid + # ------------------------------------------------------------------------- + largs <- list(data, timebarsize, buckets, samplength, tradinghours, verbose) + names(largs) <- names(formals()) + rst <- .xcheck$args(arglist = largs, fn = "vpin") + ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) + + # Convert data into a dataframe, in case it is a matrix or an array + data <- as.data.frame(data) + + # Initialize the local variables + tbv <- vbs <- bucket <- NULL + estimatevpin <- new("estimate.vpin") + + time_on <- Sys.time() + + ux$show(c= verbose, m = vpin_ms$start) + + ############################################################################## + # STEP 0 : CHECKING AND PREPARING THE DATA + ############################################################################## + + # ---------------------------------------------------------------------------- + # USER MESSAGE + # ---------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step1) + + # We want only three columns: timestamp, price and volume so we import into + # the dataset only the three first columns + + dataset <- data[, 1:3] + + # We rename the first three columns to "timestamp", "price" and "volume" + + colnames(dataset) <- c("timestamp", "price", "volume") + + # We check if the columns price and volume are numeric so that we can make + # operations on them. If they are not, convert them to numeric. + + dataset$price <- as.numeric(dataset$price) + + dataset$volume <- as.numeric(dataset$volume) + + dataset$timestamp <- as.POSIXct(dataset$timestamp) + + rownames(dataset) <- NULL + + # If the argument 'timebarsize' is larger than the total duration of the + # datasets in milliseconds, the abort. + alltime <- 1000 * as.numeric(difftime(max(dataset$timestamp), + min(dataset$timestamp), units = "secs")) + + if (timebarsize >= alltime) { + ux$show(m = vpin_ms$aborted, warning = TRUE) + ux$stopnow(m = vpin_err$largetimebarsize, s = vpin_err$fn) + } + + ############################################################################## + # STEP 1 : CREATING THE T-SECOND TIMEBARS DATASET + ############################################################################## + + # ---------------------------------------------------------------------------- + # USER MESSAGE + # ---------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step2, skip = FALSE) + + # ---------------------------------------------------------------------------- + # I.1 CREATE THE VARIABLE INTERVAL + # ---------------------------------------------------------------------------- + + # Create a variable called interval which contains the timebar extracted from + # the timestamp. If timebarsize == 60, then the observation of timestamp + # "2019-04-01 00:33:49" belong to the interval "2019-04-01 00:33:00", that is + # the timebar that starts at the minute 33 and lasts 1 minute (60 seconds) + # (timebarsize) + + # If timebarsize == 300, then the observation of timestamp "2019-04-01 00:33:49" + # belong to the interval "2019-04-01 00:30:00", that is the timebar that + # starts at the minute 30 and lasts 5 minutes (300 seconds) (timebarsize) + + tbsize <- paste(timebarsize, " sec", sep = "") + + dataset$interval <- cut(dataset$timestamp, breaks = tbsize) + + # ---------------------------------------------------------------------------- + # I.2 CREATE THE T-MINUTE TIMEBAR DATASET CALLED 'MINUTEBARS' + # ---------------------------------------------------------------------------- + + # Create t-minute timebar dataset which will aggregate the transaction volume + # (tbv: timebar volume) and price change: last price - first price (dp) per + # interval + + # ---------------------------------------------------------------------------- + # - THIS PART IS JUST TO ESTIMATE RUNNING TIME - NO OUTPUT USED - + # - - + diffseconds <- function(time_on, time_off) { + dsecs <- difftime(time_off, time_on, units = "secs") + return(dsecs) + } + + if (nrow(dataset) >= 50000) { + + temptime_on <- Sys.time() + + chunk <- 5000 + + tempbars <- aggregate(price ~ interval, data = dataset[1:chunk, ], + FUN = function(x) dp <- tail(x, 1) - head(x, 1) + ) + tempbars <- merge(tempbars, + aggregate(volume ~ interval, dataset[1:chunk, ], sum), + by = "interval" + ) + tempbars$interval <- as.POSIXct(tempbars$interval, tz = "") + + temptime_off <- Sys.time() + + exptime <- ux$timediff(temptime_on, temptime_off, + 5*log2(nrow(dataset) / (chunk))) + + ux$show(c= verbose, m = paste("[~", ceiling(exptime), "seconds]")) + + } else { + + ux$show(c= verbose, m = "") + } + + # - - + # ---------------------------------------------------------------------------- + + minutebars <- aggregate(price ~ interval, data = dataset, + FUN = function(x) dp <- tail(x, 1) - head(x, 1) + ) + minutebars <- merge(minutebars, + aggregate(volume ~ interval, dataset, sum), + by = "interval") + minutebars$interval <- as.POSIXct(minutebars$interval, tz = "") + + colnames(minutebars) <- c("interval", "dp", "tbv") + minutebars <- minutebars[complete.cases(minutebars), ] + + ############################################################################ + # STEP 2 : CALCULATING VOLUME BUCKET SIZE AND SIGMA(DP) + ############################################################################ + + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step3) + + # -------------------------------------------------------------------------- + # II.1 CALCULATE NUMBER OF DAYS (NDAYS), TOTAL VOLUME (TOTVOL) AND THEN VBS + # -------------------------------------------------------------------------- + + # Description of the variables: + # ndays : number of unique days in the dataset minutebars + # totvol : total volume over all dataset + # vbs : VOLUME BUCKET SIZE + + ndays <- length(unique(substr(minutebars$interval, 1, 10))) + + totvol <- sum(minutebars$tbv) + vbs <- (totvol / ndays) / buckets + + params <- c(timebarsize, buckets, samplength, vbs, ndays) + + names(params) <- c("tbSize", "buckets", "samplength", "VBS", "ndays") + + estimatevpin@parameters <- params + + # -------------------------------------------------------------------------- + # II.2 CALCULATE THE STANDARD DEVIATION OF DP (SDP) + # -------------------------------------------------------------------------- + + sdp <- sd(minutebars$dp) + + ############################################################################ + # STEP 3 : BREAKING UP LARGE T-MINUTE TIME BARS' VOLUME + ############################################################################ + + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step4) + + # -------------------------------------------------------------------------- + # TASK DESCRIPTON + # -------------------------------------------------------------------------- + + # To avoid that one time bar spans over many buckets, we have to make sure + # to divide all large enough timebars into 'identical' timebars but with trade + # volume lower or equal to a given threshold. The threshold is a function of + # vbs. + + # The threshold is chosen in the code to be the following function of vbs: + # threshold = (1-1/x)*vbs where x is a measure of precision. If x=1, then the + # threshold is 0;if x=2 then the threshold = 50% of vbs; if x=10 then the + # threshold = 90% vbs. + + # We give an example here about what we intend to do. Assume that vbs= 1250 + # and x=5; the threshold is then: threshold = (1-1/5) vbs = 80% vbs = + # 0.8*1250 = 1000. This means we will break all minutebars with volume (tbv) + # higher than 1000 into identical timebars but with volume lower or equal + # to threshold= 1000. + + # Imagine we have the following timebar: + # + # minute dp tbv + # ------------------------------- + # 2019-04-02 04:53 53.0 5340. + + # Since threshold= 1000 we want to break this timebar into six 'identical' + # timebars (time and dp) but will volume (tbv) smaller or equal to threshold. + # The expected result is: + # + # minute dp tbv + # ------------------------------- + # 2019-04-02 04:53 53.0 1000. + # 2019-04-02 04:53 53.0 1000. + # 2019-04-02 04:53 53.0 1000. + # 2019-04-02 04:53 53.0 1000. + # 2019-04-02 04:53 53.0 1000. + # 2019-04-02 04:53 53.0 340. + + # -------------------------------------------------------------------------- + # III.1 DEFINE THE THRESHOLD AND FIND ALL TIMEBARS WITH VOLUME > THRESHOLD + # -------------------------------------------------------------------------- + + # A variable 'id' is assigned to each timebar to easily identify them in + # future tasks. + + minutebars$id <- seq.int(nrow(minutebars)) + + # Define the precision factor (x) and calculate the threshold + + x <- 10 + threshold <- (1 - 1 / x) * vbs + + # Find all timebars with volume higher than the threshold and store them in + # largebars + # Find the position of all these timebars in the original dataset and store + # them in the vector largebnum + + largebars <- subset(minutebars, tbv > threshold) + + largebnum <- NULL + if (nrow(largebars) != 0) + largebnum <- which(minutebars$id %in% largebars$id) + + # -------------------------------------------------------------------------- + # III.2 BREAK DOWN LARGE VOLUME TIMEBARS INTO SMALLER ONES + # -------------------------------------------------------------------------- + + # We break down these large volume timebars into replicated timebars with a + # maximum volume equal to threshold. If, for example, the timebar has volume + # 5340 and threshold = 1000, we create five (5)'identical' timebars with a + # volume 1000 each and and one additional timebar containing the remainder + # volume i.e. 340. + # Mathematically, 5 is the integer division of 5340 by 1000. The normal + # division gives 5.34 and the integer division takes the integer part of + # 5.34 which is 5 (5340 %/% 5). To find the remainder, we subtract from + # 5340 the product of (5340 %/% 1000)*1000.The result is + # 5340 - (5340 %/% 1000)*1000 = 5340 - 5*1000 = 5340 - 5000 = 340. There + # is a function for finding the remainder in R which is %%. + # Writing 5340 %% 1000 gives 340. + + # We start by placing the remainder in the original rows for large timebars. + # We identify these timebars in the original dataset and replace the volume + # (tbv) with the remainder of the division of tbv by the threshold. + + # To use the timebar in our example: + # + # minute dp tbv + # ------------------------------- + # 2019-04-02 04:53 53.0 5340. + # + # Replacing the volume (tbv) by the remainder gives: + # + # minute dp tbv + # ------------------------------- + # 2019-04-02 04:53 53.0 340. + + if (!is.null(largebnum)) { + + minutebars[largebnum, ]$tbv <- minutebars[largebnum, ]$tbv %% threshold + + # We will now replicate the time bar with the same price movement (dp), the + # same interval but with trade volume equal to threshold/x (x=10 in our case). + # The number of replications we need is the integer division of (tbv) by + # (threshold/x). n_rep stores these numbers of replication. + # This number for each of these rows is the integer division of (tbv) by + # (threshold/x), i.e., largebars$tbv %/% threshold + + n_rep <- x * largebars$tbv %/% threshold + + # All that is left now is to change the value of 'tbv' in 'largebars' to + # (threshold/x) and then replicate the timebars using the corresponding value + # in n_rep + + largebars$tbv <- threshold / x + + largebars <- largebars[rep(seq_len(nrow(largebars)), n_rep), ] + + rm(n_rep) + + # In our example, largebars will contain 50 replicated timebars of the + # following timebar: + # + # minute dp tbv + # ------------------------------- + # 2019-04-02 04:53 53.0 100. + # + # Since threshold is 1000, so threshold/x = 1000/10=100; so each of our + # timebars should have a volume of 100. Since we have already placed the + # remainder (340) in the original dataset, we just need to distribute the + # remaining volume 5000 into timebars with a volume of 100, which + # gives us 50 such timebars + + # The last step now is to add these rows to the main dataset and sort it by + # interval so that all timebars of the same interval will be neighbors. + + minutebars <- rbind(minutebars, largebars) + minutebars <- minutebars[order(minutebars$interval), ] + + # Finally, we reassign new identifiers to timebars (id) to make it easier to + # identify them in coming tasks + + minutebars$id <- seq.int(nrow(minutebars)) + rm(largebars, largebnum) + + } + + ############################################################################ + # STEP 4 : ASSIGNING T-MINUTE TIME BARS INTO BUCKETS + ############################################################################ + + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step5) + + # -------------------------------------------------------------------------- + # IV.1 ASSIGN A BUCKET TO EACH TIMEBAR | FIND EXCESS VOLUME FOR EACH OF THEM + # -------------------------------------------------------------------------- + + # Find the cumulative volume (runvol) over all minutebars + + minutebars$runvol <- cumsum(minutebars$tbv) + + # Find the bucket to which each timebar belongs, using integer division by the + # volume bucket size (vbs). If vbs = 100 and the cumulative volume is 70; it + # means that we have not yet filled the first bucket so the timebar should + # belong to bucket 1. We do that using the integer division 70/100 which + # gives 0 and to which we add 1 to obtain bucket.In general the bucket size + # is obtained by integer-dividing the cumulative volume (runvol) by vbs and + # adding +1. If the cumulative volume is 472, we know that it must be in + # bucket 5. Indeed, the formula gives us bucket 5 since 472%/%100+1=4+1 = 5 + + minutebars$bucket <- 1 + minutebars$runvol %/% vbs + + # The variable excess volume (exvol) calculates the excess volume after we + # have filled the bucket. As in the example above, if the timebar has + # cumulative volume runvol = 472. The excess volume should be 72; which is + # what is left after filling 4 buckets. Since we are in bucket 5; the excess + # volume can be obtained as 72 = runvol - (5-1)*vbs = 472 - 4*100 + # In general the formula: exvol = runvol - (bucket -1)*vbs + + minutebars$exvol <- minutebars$runvol - (minutebars$bucket - 1) * vbs + + ############################################################################ + # STEP 5 : BALANCING TIMEBARS AND ADJUSTING BUCKET SIZES TO VBS + ############################################################################ + + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step6) + + # -------------------------------------------------------------------------- + # TASK DESCRIPTION + # -------------------------------------------------------------------------- + + # We will give attention here to timebars that occur between buckets, that is + # have volume that belongs to two buckets at the same time. Assume vbs=100 and + # we have the following: + # + # interval dp tbv runvol bucket exvol + # -------------------------------------------------------- + # 2019-04-02 04:52 14.0 50. 90 1 90 + # 2019-04-02 04:53 53.0 30. 120 2 20 + # + # The timebar of the interval "2019-04-02 04:52" has a tbv of 30. A volume of + # 10 belongs to bucket 1 that used to have 90 and needs 10 to reach vbs=100; + # and a volume of 20 belongs to bucket 2 (which we can find in the excess + # volume exvol). Basically, for each first timebar of a bucket, the volume + # that belongs to the previous bucket is tbv - exvol (30-20=10 in our + # example) and the volume that belongs to the current bucket is exvol (20 in + # our example.). + # + # In order to have balanced buckets, we want our code to 'divide' the first + # timebar of bucket 2 into 'identical' timebars one containing a volume of + # 10 and belonging to bucket 1 and the other containing a volume of 20 and + # belonging to bucket 2. The output shall look like. + # + # interval dp tbv runvol bucket exvol + # -------------------------------------------------------- + # 2019-04-02 04:52 14.0 50. 90 1 90 + # 2019-04-02 04:53 53.0 10. 120 1 100 + # 2019-04-02 04:53 53.0 20. 120 2 20 + + # -------------------------------------------------------------------------- + # V.1 FINDING THE FIRST TIMEBAR IN EACH BUCKET + # -------------------------------------------------------------------------- + + # Find the first timebar in each bucket, we will have to ignore the first + # timebar of bucket 1 as it does not share trade volume with a previous bucket + + xtrarows <- + aggregate(. ~ bucket, subset(minutebars, bucket != 1), FUN = head, 1) + + # In our example, xtrarows should contain the following timebar + # + # interval dp tbv runvol bucket exvol + # -------------------------------------------------------- + # 2019-04-02 04:53 53.0 30. 120 2 120 + + # -------------------------------------------------------------------------- + # V.2 CHANGE THE VOLUME OF FIRST TIMEBAR IN EACH BUCKET TO EXVOL + # -------------------------------------------------------------------------- + + # In order to change any row in the main dataset, we need the row identifier + # (id). For the timebars that we have extracted in the dataframe xtrarows, we + # find their identifiers and store them in the vector xtraNum + + xtrarnum <- which(minutebars$id %in% xtrarows$id) + + # Now that we have found all the identifiers; for each minutebar having an + # identifier in xtrarnum, change the value volume (tbv) to the value of excess + # volume (exvol). + + minutebars[xtrarnum, "tbv"] <- minutebars[xtrarnum, "exvol"] + + # -------------------------------------------------------------------------- + # V.3 CREATE LAST MINUTEBAR IN EACH BUCKET TO REACH VBS + # -------------------------------------------------------------------------- + + # In the task description, we have established that we need to add a replicate + # of the first timebar of each bucket to the previous bucket and containing + # the volume equal to tbv - exvol. Review the task description if this is not + # clear. All such first timebars already exist in the dataframe xtrarows. We + # will just change the bucket number to the one before it and set tbv to + # tbv-exvol. + + xtrarows$tbv <- xtrarows$tbv - xtrarows$exvol + xtrarows$bucket <- xtrarows$bucket - 1 + + # -------------------------------------------------------------------------- + # V.4 CALCULATE ACCUMULATED BUCKET VOLUME FOR THE ORIGINAL DATASET + # -------------------------------------------------------------------------- + + # We add the replicated timebars in xtrarows to the main dataset + # 'minutebars' and sort it by interval and by bucket so we have all bucket + # minutebars next to one another. + + xtrarows$interval <- as.POSIXct(xtrarows$interval, + origin = "1970-01-01", tz = "") + xtrarows <- xtrarows[c("interval", "dp", "tbv", "id", "runvol", "bucket", + "exvol")] + + minutebars <- rbind(minutebars, xtrarows) + minutebars <- minutebars[order(minutebars$interval, minutebars$bucket), ] + + # We calculate accumulated bucket volume which is the exvol for the new values + # of tbv + + minutebars$runvol <- cumsum(minutebars$tbv) + minutebars$exvol <- minutebars$runvol - (minutebars$bucket - 1) * vbs + + rm(xtrarnum, xtrarows) + + ############################################################################ + # STEP 6 : CALCULATING AGGREGATE BUCKET DATA + ############################################################################ + + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step7) + + # -------------------------------------------------------------------------- + # VI.1 ASSIGN N(0, 1) PROB. TO EACH TIMEBAR AND CALCULATE BUY/SELL VOLUMES + # -------------------------------------------------------------------------- + + # Use the cdf of N(0, 1) to calculate zb = Z(dp/sdp) and zs = 1- Z(dp/sdp) + # The variable zb calculates the probability that the current price change + # normalized by standard deviation (dp/sdp) corresponds to timebar with + # buyer-initiated transactions, while zs calculates the probability that + # the current price change normalized by standard deviation (dp/sdp) + # corresponds to timebar with seller-initiated transactions. + + + minutebars$zb <- pnorm(minutebars$dp / sdp) + minutebars$zs <- 1 - pnorm(minutebars$dp / sdp) + # Calculate Buy Volume (bvol) and Sell volume (svol) by multiplying timebar's + # volume (tbv) by the corresponding probabilities zb and zs. + minutebars$bvol <- minutebars$tbv * minutebars$zb + minutebars$svol <- minutebars$tbv * minutebars$zs + minutebars <- minutebars[which(minutebars$tbv > 0), ] + # -------------------------------------------------------------------------- + # VI.2 CALCULATING AGGREGATE BUCKET DATA + # -------------------------------------------------------------------------- + + # Aggregate variables + # ++++++++++++++++++++ + # agg.bvol : sum of buy volume (bvol) per bucket + # agg.svol : sum of sell volume (svol) per bucket + # OI : the difference between agg.bvol and agg.svol + # init.time : the first timebar in the bucket + # final.time : the last timebar in the bucket + # +++++++++++++++++++ + bucketdata <- setNames( + aggregate(cbind(bvol, svol) ~ bucket, minutebars, sum), + c("bucket", "agg.bvol", "agg.svol")) + bucketdata$aoi <- abs(bucketdata$agg.bvol - bucketdata$agg.svol) + bucketdata$starttime <- aggregate(interval ~ bucket, + minutebars, head, 1)$interval + bucketdata$endtime <- aggregate(interval ~ bucket, + minutebars, tail, 1)$interval + bucketdata$duration <- as.numeric( + difftime(bucketdata$endtime, bucketdata$starttime, units = "secs")) + zero_duration_percentage <- sum(bucketdata$duration == 0) / nrow(bucketdata) * 100 + if (zero_duration_percentage > 5) { + warning(sprintf("More than 5%% of duration values are zero, reduce the timebar duration or increase the bucket size.")) + } + + ############################################################################ + # STEP 7 : CALCULATING iVPIN VECTOR FOLLOWING KE 2017 + ############################################################################ + + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- + + ux$show(c= verbose, m = vpin_ms$step8) + + # -------------------------------------------------------------------------- + # TASK DESCRIPTION + # -------------------------------------------------------------------------- + + # The calculation of the VPIN vector uses + the vector of order imbalances + # (OI), the sample length (samplength) and the volume bucket size (vbs). + # Assume for now that samplength is 50. The formula for the first VPIN + # observation is sum(OI, i=1 to 50)/(50*vbs). + # The formula for the 5th VPIN observation is sum(OI, i=5 to 54)/(50*vbs). + + # Note that sum(OI, i=5 to 54) = sum(OI, i=1 to 54) - sum(OI, i=1 to 4) + # The first one is the cumulative sum at 54 and the second one is cumulative + # sum at 4. So sum(OI, i=5 to 54) = cumsum[54]-cum[4]. + # The general formula is sum(OI, i=n to 50+n) = cumsum[50+n]-cum[n]. + # + # So we can calculate VPIN for the bucket 50, we shift the cumsum by 1 + # position. We calculate first the cumulative sum for OI and then use the + # formula to find the value of VPIN + + # -------------------------------------------------------------------------- + # VII.1 CALCULATING VPIN VECTOR + # -------------------------------------------------------------------------- + + # Calculate the cumulative sum of OI: cumoi + + if (nrow(bucketdata) < samplength) { + + estimatevpin@success <- FALSE + + estimatevpin@errorMessage <- vpin_err$largesamplength + + ux$show(c= verbose, m = vpin_ms$aborted) + + ux$show(ux$line()) + + ux$stopnow(m = vpin_err$largesamplength, s = vpin_err$fn) + + } + + + # -------------------------------------------------------------------------- + # VII.2 CORRECTING THE DURATION VECTOR + # -------------------------------------------------------------------------- + + corduration <- function(etime, stime, duration) { + + days <- round(as.numeric(difftime(etime, stime, units = "days"))) + + duration <- as.numeric(duration) + + if (days > 0) { + dseconds <- (days - 1) * 3600 * 24 + (24 - tradinghours) * 3600 + return(duration - dseconds) + } + return(duration) + } + + if (tradinghours > 0 & tradinghours < 24) { + bucketdata$duration <- apply(bucketdata, 1, function(x) + corduration(x[6], x[5], x[10])) + } + print(nrow(data)) + # Normalize the duration to be centered around 1 + mean_duration <- mean(bucketdata$duration) + bucketdata$duration <- bucketdata$duration / mean_duration + bucketdata$duration <- ifelse(bucketdata$duration == 0, 0.1, bucketdata$duration) + + compute_log_likelihood <- function(params, t, Vb, Vs) { + alpha <- sigmoid(params[1]) + delta <- sigmoid(params[2]) + mu <- params[3] ^ 2 + eps <- params[4] ^ 2 + + e_1 <- log(alpha * delta) + Vb * log(eps) + Vs * log(eps + mu) - (2 * eps + mu) * t + e_2 <- log(alpha * (1 - delta)) + Vb * log(eps + mu) + Vs * log(eps) - (2 * eps + mu) * t + e_3 <- log(1 - alpha) + Vb * log(eps) + Vs * log(eps) - 2 * eps * t + e_max <- pmax(e_1, e_2, e_3) + + log_likelihood <- -sum(log(exp(e_1 - e_max) + exp(e_2 - e_max) + exp(e_3 - e_max)) + e_max) + return(log_likelihood) + } + + # Compute iVPIN with a rolling window + + start_index <- samplength + 1 + + " + Easley et al. (2012b) derive the VPIN estimator based on the argument of two moment conditions, + E[|VτB - VτS|] ≈ αμ and E[|VτB + VτS|] = 2ε + αμ, from the Poisson processes. + According to Ke et al. (2017), the two moment conditions should instead be expressed as + E[|VτB-VτS||tτ;θ]≈ αμtτ and E[|VτB + VτS|tτ; θ ] = (2ε + αμ)tτ, + " + total_arrival_rate <- (bucketdata$agg.bvol + bucketdata$agg.svol) / bucketdata$duration + informed_arrival <- abs(bucketdata$agg.bvol - bucketdata$agg.svol) / bucketdata$duration + + t <- bucketdata$duration + Vb <- bucketdata$agg.bvol + Vs <- bucketdata$agg.svol + + ivpin <- numeric(nrow(bucketdata)) # Ensure ivpin is initialized + + for (i in start_index:nrow(bucketdata)) { + j <- i - samplength + parms <- rep(-Inf, 4) # alpha, delta, mu, eps + + log_lik <- Inf + flag <- -Inf + + best_log_likelihood <- Inf + exit_flag <- FALSE + + if (i == start_index || i %% 10 == 0) { + for (alpha_init in seq(0.1, 0.9, by = 0.2)) { + for (delta_init in seq(0.1, 0.9, by = 0.2)) { + mu_init <- mean(informed_arrival[j:i]) / alpha_init + eps_init <- mean(abs(total_arrival_rate[j:i] - informed_arrival[j:i])) / 2 + + initial_guess <- c(logit(max(min(alpha_init, 0.999), 0.001)), + logit(max(min(delta_init, 0.999), 0.001)), + sqrt(mu_init), + sqrt(eps_init)) + tryCatch({ + result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") + }, error = function(e) { + conditionMessage(e) + }) + + if (result$value < best_log_likelihood && is.finite(result$value)) { + best_params <- result$par + best_log_likelihood <- result$value + exit_flag <- TRUE + } + } + } + } else { + initial_guess <- c(logit(best_params[1]), logit(best_params[2]), sqrt(best_params[3]), sqrt(best_params[4])) + tryCatch({ + result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") + }, error = function(e) { + conditionMessage(e) + }) + + if (result$value < best_log_likelihood && is.finite(result$value)) { + best_params <- result$par + exit_flag <- TRUE + } else { + for (alpha_init in seq(0.1, 0.9, by = 0.2)) { + for (delta_init in seq(0.1, 0.9, by = 0.2)) { + mu_init <- informed_arrival[i] / alpha_init + eps_init <- abs(total_arrival_rate[i] - informed_arrival[i]) / 2 + initial_guess <- c(logit(max(min(alpha_init, 0.999), 0.001)), + logit(max(min(delta_init, 0.999), 0.001)), + sqrt(mu_init), + sqrt(eps_init)) + tryCatch({ + result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") + }, error = function(e) { + conditionMessage(e) + }) + if (result$value < best_log_likelihood && is.finite(result$value)) { + best_params <- result$par + best_log_likelihood <- result$value + exit_flag <- TRUE + } + } + } + } + } + + if (exists("best_params")) { + best_params[1:2] <- sigmoid(best_params[1:2]) + best_params[3:4] <- best_params[3:4] ^ 2 + ivpin_estimate <- best_params[1] * best_params[3] / (2 * best_params[4] + best_params[3]) + ivpin[i] <- ivpin_estimate + } else { + cat("No valid parameters found at index", i, "\n") + ivpin[i] <- NA + } + } + + return(ivpin) +} From 3c8d0bff66a77205f68ab6f82ef412b86785c631 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 08:49:18 -1000 Subject: [PATCH 02/19] Update output_classes.R add ivpin output class --- R/output_classes.R | 90 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/R/output_classes.R b/R/output_classes.R index e956118..87814b6 100644 --- a/R/output_classes.R +++ b/R/output_classes.R @@ -302,6 +302,96 @@ setMethod( ) + +# ---------------------------------------------------------------------------- # +# IVPIN Classes # +# ---------------------------------------------------------------------------- # + +#' @title IVPIN estimation results +#' +#' @description The class \code{estimate.ivpin} is a blueprint for `S4` objects +#' that store the results of the `IVPIN` estimation method using the function +#' \code{ivpin()}. +#' @slot success (`logical`) returns the value \code{TRUE} when the estimation +#' has succeeded, \code{FALSE} otherwise. +#' @slot errorMessage (`character`) returns an error message if the `IVPIN` +#' estimation has failed, and is empty otherwise. +#' @slot parameters (`numeric`) returns a numeric vector of estimation +#' parameters (tbSize, buckets, samplength, VBS, #days), where `tbSize` is the +#' size of timebars (in seconds); `buckets` is the number of buckets per average +#' volume day; `VBS` is Volume Bucket Size (daily average volume/number of +#' buckets `buckets`); `samplength` is the length of the window used to estimate +#' `IVPIN`; and `#days` is the number of days in the dataset. +#' @slot bucketdata (`dataframe`) returns the dataframe containing detailed +#' information about buckets. +#' @slot ivpin (`numeric`) returns the vector of the volume-synchronized +#' probabilities of informed trading. +#' @slot dailyivpin (`dataframe`) returns the daily `IVPIN` values. Two +#' variants are provided for any given day: \code{divpin} corresponds to +#' the unweighted average of ivpin values, and \code{divpin.weighted} +#' corresponds to the average of ivpin values weighted by bucket duration. +#' @slot runningtime (`numeric`) returns the running time of the `IVPIN` +#' estimation in seconds. +setClass( + "estimate.ivpin", + slots = list( + success = "logical", errorMessage = "character", parameters = "numeric", + bucketdata = "data.frame", ivpin = "numeric", dailyivpin = "data.frame", + runningtime = "numeric" + ), + prototype = list( + success = TRUE, errorMessage = "", parameters = 0, + bucketdata = data.frame(), ivpin = 0, dailyivpin = data.frame(), + runningtime = 0 + ) +) + +#' @rdname estimate.ivpin-class +#' @description The function show() displays a description of the +#' estimate.ivpin object: descriptive statistics of the `IVPIN` variable, +#' the set of relevant parameters, and the running time. +#' @param object an object of class \code{estimate.ivpin} +setMethod( + "show", signature(object = "estimate.ivpin"), + function(object) { + + # load the digits for display of decimals + digits <- getOption("PIN-digits") + + interface <- uiclasses$ivpin(object) + + ux$show(m = interface$line) + ux$show(m = interface$outcome) + ux$show(m = interface$line) + ux$show(m = interface$ivpinfunctions) + + if (object@success) { + + ux$show(m = interface$badge, skip = FALSE) + + xsummary <- interface$ivpinsummary + + show(knitr::kable(xsummary, "simple", padding = 1L, align = "c", + caption = interface$summarycaption + )) + + xparams <- interface$ivpinparams + + show(knitr::kable(xparams, "simple", padding = 1L, align = "c", + caption = interface$paramscaption + )) + + } else { + + ux$show(m = interface$error, warning = TRUE) + } + + ux$show(m = interface$runningtime) + + } +) + + # ---------------------------------------------------------------------------- # # MPIN Classes # # ---------------------------------------------------------------------------- # From acef763e215eca13f004d82db2a16b62e9948792 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 08:50:37 -1000 Subject: [PATCH 03/19] Update DESCRIPTION add lubridate to imports --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 43a659b..27c869d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ LazyDataCompression: xz RoxygenNote: 7.2.1 Roxygen: list(markdown = TRUE) VignetteBuilder: knitr -Imports: Rdpack, knitr, methods, skellam, nloptr, furrr, future, dplyr, rmarkdown, coda +Imports: Rdpack, knitr, methods, skellam, nloptr, furrr, future, dplyr, rmarkdown, coda, lubridate RdMacros: Rdpack Depends: R (>= 3.5.0) Suggests: fansi, htmltools From c8449e5f218dfddb854d806c04eed5dd2d9958fa Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 08:52:41 -1000 Subject: [PATCH 04/19] Add files via upload --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 9e68a3e..ae9ca17 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ export(initials_mpin) export(initials_pin_ea) export(initials_pin_gwj) export(initials_pin_yz) +export(ivpin) export(mpin_ecm) export(mpin_ml) export(pin) From f1ca0203b3e9cd20d4c3db65a8cd12ffc173effc Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 14:25:51 -1000 Subject: [PATCH 05/19] Update model_ivpin.R --- R/model_ivpin.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 8e43c0d..300258f 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -181,7 +181,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # Initialize the local variables tbv <- vbs <- bucket <- NULL - estimatevpin <- new("estimate.vpin") + estimateivpin <- new("estimate.ivpin") time_on <- Sys.time() @@ -340,7 +340,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, names(params) <- c("tbSize", "buckets", "samplength", "VBS", "ndays") - estimatevpin@parameters <- params + estimateivpin@parameters <- params # -------------------------------------------------------------------------- # II.2 CALCULATE THE STANDARD DEVIATION OF DP (SDP) @@ -748,9 +748,9 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, if (nrow(bucketdata) < samplength) { - estimatevpin@success <- FALSE + estimateivpin@success <- FALSE - estimatevpin@errorMessage <- vpin_err$largesamplength + estimateivpin@errorMessage <- vpin_err$largesamplength ux$show(c= verbose, m = vpin_ms$aborted) @@ -900,6 +900,9 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, ivpin[i] <- NA } } + estimateivpin@ivpin <- ivpin + estimatevpin@runningtime <- ux$timediff(time_on, time_off) + ux$show(c= verbose, m = vpin_ms$complete) - return(ivpin) + return(estimateivpin) } From 726a2bcff9568884388b9e93679718c0bf549296 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 14:39:23 -1000 Subject: [PATCH 06/19] Update model_ivpin.R --- R/model_ivpin.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 300258f..97b8a3a 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -902,6 +902,10 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, } estimateivpin@ivpin <- ivpin estimatevpin@runningtime <- ux$timediff(time_on, time_off) + num_of_nan <- sum(is.na(ivpin)) + if (num_of_nan > 0) { + warning(sprintf("Optimization failed for %d buckets", num_of_nan)) + } ux$show(c= verbose, m = vpin_ms$complete) return(estimateivpin) From 615fe5683334fbdc92a430f0da0c6365e8f49b1d Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 14:40:55 -1000 Subject: [PATCH 07/19] Update model_ivpin.R --- R/model_ivpin.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 97b8a3a..7153371 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -6,7 +6,7 @@ ## ## Purpose of script: ## Implement the estimation of the Intraday Volume-synchronized PIN -## measure (IVPIN). +## measure (IVPIN) of Ke et al. (2017). ## ## Author: ## Alexandre Borentain From 59a48269f97264a068919bc6b6c5682341981379 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 14:46:09 -1000 Subject: [PATCH 08/19] Update model_ivpin.R --- R/model_ivpin.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 7153371..11c2863 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -5,7 +5,7 @@ ## model_ivpin.R ## ## Purpose of script: -## Implement the estimation of the Intraday Volume-synchronized PIN +## Implement the estimation of the Improved Volume-synchronized Probability of Informed Trading ## measure (IVPIN) of Ke et al. (2017). ## ## Author: @@ -26,7 +26,7 @@ ## ++++++++++++++++++ ## ## ivpin(): -## Estimates the Intraday Volume-Synchronized Probability of Informed +## Estimates the Improved Volume-synchronized Probability of Informed Trading ## Trading. ## ## ++++++++++++++++++ From f8f89cf939b3d4c32fdaf00ccf77e5987430d672 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Sat, 22 Jun 2024 18:50:32 -1000 Subject: [PATCH 09/19] Update model_ivpin.R --- R/model_ivpin.R | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 11c2863..ee78362 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -132,14 +132,6 @@ #' buckets <- estimate@bucketdata #' show(head(buckets, 10)) #' -#' # Store the daily IVPIN values (weighted and unweighted) in a dataframe -#' # 'dayivpin'. -#' -#' # Display the first 10 rows of the dataframe 'dayivpin'. -#' -#' dayivpin <- estimate@dailyvpin -#' show(head(dayivpin, 10)) -#' #' @export ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, tradinghours = 24, verbose = TRUE) { @@ -152,14 +144,6 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, @tradinghours : length of trading days - used to correct the durations of buckets. Default value is 24. " - sigmoid=function(x){ - y=1/(1+exp(x)) - return(y) - } - logit=function(y){ - x=log(1/y-1) - return(x) - } vpin_err <- uierrors$vpin() vpin_ms <- uix$vpin(timebarsize = timebarsize) @@ -787,7 +771,20 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, mean_duration <- mean(bucketdata$duration) bucketdata$duration <- bucketdata$duration / mean_duration bucketdata$duration <- ifelse(bucketdata$duration == 0, 0.1, bucketdata$duration) - + + # The sigmoid function + sigmoid=function(x){ + y=1/(1+exp(x)) + return(y) + } + + # The inverse sigmoid function + logit=function(y){ + x=log(1/y-1) + return(x) + } + + # The function to be optimized compute_log_likelihood <- function(params, t, Vb, Vs) { alpha <- sigmoid(params[1]) delta <- sigmoid(params[2]) From aee81fa23f6f5ca159a4932f2a6367bab6ae010d Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Mon, 1 Jul 2024 02:25:26 -1000 Subject: [PATCH 10/19] Update model_ivpin.R --- R/model_ivpin.R | 368 +++++++++++++++++++++++++----------------------- 1 file changed, 189 insertions(+), 179 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index ee78362..6774967 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -388,6 +388,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # A variable 'id' is assigned to each timebar to easily identify them in # future tasks. + minutebars$id <- seq.int(nrow(minutebars)) # Define the precision factor (x) and calculate the threshold @@ -401,93 +402,139 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # them in the vector largebnum largebars <- subset(minutebars, tbv > threshold) + + # # -------------------------------------------------------------------------- + # # III.2 BREAK DOWN LARGE VOLUME TIMEBARS INTO SMALLER ONES + # # -------------------------------------------------------------------------- + + # We break down these large volume timebars into smaller timebars with a + # maximum volume equal to threshold. + # The code recursively splits large timebars into smaller ones and removes + # the original large timebars from the final dataset, until only + # timebars with volumes below the threshold remain. + + # If, for example, the initial minute bar with interval 10:00:00 - 10:01:00 + # has a total volume (tbv) of 2200, which exceeds the threshold of 1000, and + # is composed in the following trades: + + # trades price volume + # --------------------------------- + # 2023-06-30 10:00:05 1 500 + # 2023-06-30 10:00:10 1 600 + # 2023-06-30 10:00:20 1 700 + # 2023-06-30 10:00:30 1 400 + + # The first pass splits this initial minute bar into two 30-second intervals: - largebnum <- NULL - if (nrow(largebars) != 0) - largebnum <- which(minutebars$id %in% largebars$id) - - # -------------------------------------------------------------------------- - # III.2 BREAK DOWN LARGE VOLUME TIMEBARS INTO SMALLER ONES - # -------------------------------------------------------------------------- - - # We break down these large volume timebars into replicated timebars with a - # maximum volume equal to threshold. If, for example, the timebar has volume - # 5340 and threshold = 1000, we create five (5)'identical' timebars with a - # volume 1000 each and and one additional timebar containing the remainder - # volume i.e. 340. - # Mathematically, 5 is the integer division of 5340 by 1000. The normal - # division gives 5.34 and the integer division takes the integer part of - # 5.34 which is 5 (5340 %/% 5). To find the remainder, we subtract from - # 5340 the product of (5340 %/% 1000)*1000.The result is - # 5340 - (5340 %/% 1000)*1000 = 5340 - 5*1000 = 5340 - 5000 = 340. There - # is a function for finding the remainder in R which is %%. - # Writing 5340 %% 1000 gives 340. - - # We start by placing the remainder in the original rows for large timebars. - # We identify these timebars in the original dataset and replace the volume - # (tbv) with the remainder of the division of tbv by the threshold. + # interval dp tbv + # --------------------------------- + # 2023-06-30 10:00:00 0 1800 (first 3 trades) + # 2023-06-30 10:00:30 1 400 (fourth trade) - # To use the timebar in our example: + # The first 30-second interval with 1800 volume exceeds the threshold, + # so it will be split further into two 15-second intervals: # - # minute dp tbv - # ------------------------------- - # 2019-04-02 04:53 53.0 5340. - # - # Replacing the volume (tbv) by the remainder gives: - # - # minute dp tbv - # ------------------------------- - # 2019-04-02 04:53 53.0 340. - - if (!is.null(largebnum)) { - - minutebars[largebnum, ]$tbv <- minutebars[largebnum, ]$tbv %% threshold - - # We will now replicate the time bar with the same price movement (dp), the - # same interval but with trade volume equal to threshold/x (x=10 in our case). - # The number of replications we need is the integer division of (tbv) by - # (threshold/x). n_rep stores these numbers of replication. - # This number for each of these rows is the integer division of (tbv) by - # (threshold/x), i.e., largebars$tbv %/% threshold - - n_rep <- x * largebars$tbv %/% threshold - - # All that is left now is to change the value of 'tbv' in 'largebars' to - # (threshold/x) and then replicate the timebars using the corresponding value - # in n_rep - - largebars$tbv <- threshold / x - - largebars <- largebars[rep(seq_len(nrow(largebars)), n_rep), ] - - rm(n_rep) - - # In our example, largebars will contain 50 replicated timebars of the - # following timebar: - # - # minute dp tbv - # ------------------------------- - # 2019-04-02 04:53 53.0 100. - # - # Since threshold is 1000, so threshold/x = 1000/10=100; so each of our - # timebars should have a volume of 100. Since we have already placed the - # remainder (340) in the original dataset, we just need to distribute the - # remaining volume 5000 into timebars with a volume of 100, which - # gives us 50 such timebars + # interval dp tbv + # --------------------------------- + # 2023-06-30 10:00:00 0 1100 (first 2 trades) + # 2023-06-30 10:00:15 1 700 (third trade) + # 2023-06-30 10:00:30 1 400 (fourth trade) + + # The first 15-second interval with 1100 volume still exceeds the threshold, + # so it will be split further into two intervals: + + # interval dp tbv + # --------------------------------- + # 2023-06-30 10:00:00 0 500 (first trade) + # 2023-06-30 10:00:07.5 1 600 (second trade) + # 2023-06-30 10:00:30 1 400 (fourth trade) + + # The final timebars with acceptable volumes are: + + # interval dp tbv + # --------------------------------- + # 2023-06-30 10:00:00 0 500 (first trade) + # 2023-06-30 10:00:07.5 1 600 (second trade) + # 2023-06-30 10:00:15 1 700 (third trade) + # 2023-06-30 10:00:30 1 400 (last trade) - # The last step now is to add these rows to the main dataset and sort it by - # interval so that all timebars of the same interval will be neighbors. - minutebars <- rbind(minutebars, largebars) - minutebars <- minutebars[order(minutebars$interval), ] + ##### NEW CODE TO BREAK MINUTE BARS AND ADJUST FOR 0 DURATION - # Finally, we reassign new identifiers to timebars (id) to make it easier to - # identify them in coming tasks + # this funtion recursive function split_timebars that splits large timebars + # into smaller ones until they have the correct volume + split_timebars <- function(data, dataset, interval_duration, threshold) { + final_timebars <- data.frame() + + # Function to split the data recursively + recursive_split <- function(data, interval_duration) { + large_timebars <- subset(data, tbv > threshold) + + if (nrow(large_timebars) == 0) { + # If no large timebars, add the data to final timebars + final_timebars <<- rbind(final_timebars, data) + return() + } + + for (i in seq_len(nrow(large_timebars))) { + large_timebar <- large_timebars[i, ] + trades <- subset(dataset, interval == large_timebar$interval) + + interval_start <- as.POSIXct(large_timebar$interval, tz = "") + midpoint <- interval_start + interval_duration / 2 + + first_half_trades <- subset(trades, timestamp <= midpoint) + second_half_trades <- subset(trades, timestamp > midpoint) + + first_interval <- format(interval_start, "%Y-%m-%d %H:%M:%S") + second_interval <- format(midpoint, "%Y-%m-%d %H:%M:%S") + + first_half <- data.frame( + interval = first_interval, + dp = ifelse(nrow(first_half_trades) > 0, tail(first_half_trades$price, 1) - head(first_half_trades$price, 1), 0), + tbv = sum(first_half_trades$volume) + ) + + second_half <- data.frame( + interval = second_interval, + dp = ifelse(nrow(second_half_trades) > 0, tail(second_half_trades$price, 1) - head(second_half_trades$price, 1), 0), + tbv = sum(second_half_trades$volume) + ) + + # Call the recursive split on the new intervals if their volumes are too large + if (first_half$tbv > threshold) { + recursive_split(first_half, interval_duration / 2) + } else { + final_timebars <<- rbind(final_timebars, first_half) + } + + if (second_half$tbv > threshold) { + recursive_split(second_half, interval_duration / 2) + } else { + final_timebars <<- rbind(final_timebars, second_half) + } + + # Remove the large time bar being split from the final_timebars + final_timebars <<- final_timebars[final_timebars$interval != large_timebar$interval, ] + } + } + + # Start the recursive split with the initial data and interval duration + recursive_split(data, interval_duration) + return(final_timebars) + } + + largebars <- subset(minutebars, tbv > threshold) + if (nrow(largebars) > 0) { + split_bars <- split_timebars(largebars, dataset, timebarsize, threshold) + minutebars <- minutebars[-which(minutebars$tbv > threshold), ] + minutebars <- rbind(minutebars, split_bars) + minutebars <- minutebars[order(as.POSIXct(minutebars$interval, format = "%Y-%m-%d %H:%M:%S")), ] + } - minutebars$id <- seq.int(nrow(minutebars)) - rm(largebars, largebnum) + # Reassign new identifiers to time bars + minutebars$id <- seq.int(nrow(minutebars)) - } ############################################################################ # STEP 4 : ASSIGNING T-MINUTE TIME BARS INTO BUCKETS @@ -709,41 +756,15 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # TASK DESCRIPTION # -------------------------------------------------------------------------- - # The calculation of the VPIN vector uses + the vector of order imbalances - # (OI), the sample length (samplength) and the volume bucket size (vbs). - # Assume for now that samplength is 50. The formula for the first VPIN - # observation is sum(OI, i=1 to 50)/(50*vbs). - # The formula for the 5th VPIN observation is sum(OI, i=5 to 54)/(50*vbs). - - # Note that sum(OI, i=5 to 54) = sum(OI, i=1 to 54) - sum(OI, i=1 to 4) - # The first one is the cumulative sum at 54 and the second one is cumulative - # sum at 4. So sum(OI, i=5 to 54) = cumsum[54]-cum[4]. - # The general formula is sum(OI, i=n to 50+n) = cumsum[50+n]-cum[n]. - # - # So we can calculate VPIN for the bucket 50, we shift the cumsum by 1 - # position. We calculate first the cumulative sum for OI and then use the - # formula to find the value of VPIN - - # -------------------------------------------------------------------------- - # VII.1 CALCULATING VPIN VECTOR - # -------------------------------------------------------------------------- - - # Calculate the cumulative sum of OI: cumoi - - if (nrow(bucketdata) < samplength) { - - estimateivpin@success <- FALSE - - estimateivpin@errorMessage <- vpin_err$largesamplength - - ux$show(c= verbose, m = vpin_ms$aborted) - - ux$show(ux$line()) - - ux$stopnow(m = vpin_err$largesamplength, s = vpin_err$fn) - - } + # We calculate IVPIN from Ke et al. 2017 from the following formula + # ivpin=(alpha*mu)/(2*epsilon+mu) where: + # -alpha is the pobability of information event occurrence + # -Mu is the informed traders’ arrival rate + # -Epsilon is the uninformed traders’ arrival rate + # -Delta is the probabilities of bad news + # We estimate IVPIN on a rolling window of length samplelength buckets. We estimate the + # parameters alpha, mu, epsilon, delta by MLE of the function in compute_log_likelihood. # -------------------------------------------------------------------------- # VII.2 CORRECTING THE DURATION VECTOR @@ -767,22 +788,18 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, corduration(x[6], x[5], x[10])) } print(nrow(data)) + # Normalize the duration to be centered around 1 mean_duration <- mean(bucketdata$duration) bucketdata$duration <- bucketdata$duration / mean_duration - bucketdata$duration <- ifelse(bucketdata$duration == 0, 0.1, bucketdata$duration) + bucketdata$duration <- ifelse(bucketdata$duration == 0, 1e-6, bucketdata$duration) - # The sigmoid function - sigmoid=function(x){ - y=1/(1+exp(x)) - return(y) - } + # The sigmoid function + sigmoid <- function(x) 1 / (1 + exp(x)) # The inverse sigmoid function - logit=function(y){ - x=log(1/y-1) - return(x) - } + logit <- function(y) log(1 / y - 1) + # The function to be optimized compute_log_likelihood <- function(params, t, Vb, Vs) { @@ -800,16 +817,22 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, return(log_likelihood) } - # Compute iVPIN with a rolling window - + # We start the computation of IVPIN at the samplength th index start_index <- samplength + 1 - " - Easley et al. (2012b) derive the VPIN estimator based on the argument of two moment conditions, - E[|VτB - VτS|] ≈ αμ and E[|VτB + VτS|] = 2ε + αμ, from the Poisson processes. - According to Ke et al. (2017), the two moment conditions should instead be expressed as - E[|VτB-VτS||tτ;θ]≈ αμtτ and E[|VτB + VτS|tτ; θ ] = (2ε + αμ)tτ, - " + # Easley et al. (2012b) derive the VPIN estimator based on two moment conditions from Poisson processes: + # E[|VB - VS|] ≈ alpha*mu and E[|VB + VS|] = 2*epsilon + alpha*mu. + # Ke et al. (2017) suggest that these conditions should be expressed as: + # E[|VB - VS||t; theta] ≈ alpha*mu*t and E[|VB + VS||t; theta] = (2*epsilon + alpha*mu)*t. + # These equations correspond to Equations (3) on page 364 of Ke et al. (2017) . + # + # In our implementation: + # - The variable `total_arrival_rate` is calculated as |VB + VS|/t. Its average (expected value) would, therefore, represent 2*epsilon + alpha*mu. + # - The variable `informed_arrival_rate` is calculated as |VB - VS|/t. Its average (expected value) would, therefore, approximate alpha*mu. + # + # This approximation allows us to estimate mu by dividing `informed_arrival_rate` by an estimated alpha. + # Once mu is estimated, we can determine epsilon using the equation for `total_arrival_rate`. + total_arrival_rate <- (bucketdata$agg.bvol + bucketdata$agg.svol) / bucketdata$duration informed_arrival <- abs(bucketdata$agg.bvol - bucketdata$agg.svol) / bucketdata$duration @@ -819,39 +842,45 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, ivpin <- numeric(nrow(bucketdata)) # Ensure ivpin is initialized + perform_grid_search <- function(best_log_likelihood, exit_flag, i, j) { + + for (alpha_init in seq(0.1, 0.9, by = 0.2)) { + for (delta_init in seq(0.1, 0.9, by = 0.2)) { + mu_init <- mean(informed_arrival[j:i]) / alpha_init + eps_init <- mean(abs(total_arrival_rate[j:i] - informed_arrival[j:i])) / 2 + + initial_guess <- c(logit(max(min(alpha_init, 0.999), 0.001)), + logit(max(min(delta_init, 0.999), 0.001)), + sqrt(mu_init), + sqrt(eps_init)) + tryCatch({ + result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") + }, error = function(e) { + conditionMessage(e) + }) + + if (result$value < best_log_likelihood && is.finite(result$value)) { + best_params <- result$par + best_log_likelihood <- result$value + exit_flag <- TRUE + } + } + } + return best_params, best_log_likelihood, exit_flag + } + + # Compute iVPIN with a rolling window by iterating over each bucket. We perfom a grid search to find the optimal initial parameters of the + # first bucket, and for each subsequent bucket, we use the previous bucket's initial parametes as the current initial paramenters. + for (i in start_index:nrow(bucketdata)) { j <- i - samplength parms <- rep(-Inf, 4) # alpha, delta, mu, eps - - log_lik <- Inf - flag <- -Inf - + best_log_likelihood <- Inf exit_flag <- FALSE - if (i == start_index || i %% 10 == 0) { - for (alpha_init in seq(0.1, 0.9, by = 0.2)) { - for (delta_init in seq(0.1, 0.9, by = 0.2)) { - mu_init <- mean(informed_arrival[j:i]) / alpha_init - eps_init <- mean(abs(total_arrival_rate[j:i] - informed_arrival[j:i])) / 2 - - initial_guess <- c(logit(max(min(alpha_init, 0.999), 0.001)), - logit(max(min(delta_init, 0.999), 0.001)), - sqrt(mu_init), - sqrt(eps_init)) - tryCatch({ - result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") - }, error = function(e) { - conditionMessage(e) - }) - - if (result$value < best_log_likelihood && is.finite(result$value)) { - best_params <- result$par - best_log_likelihood <- result$value - exit_flag <- TRUE - } - } - } + if (i == start_index) { + best_params, best_log_likelihood, exit_flag = perform_grid_search(best_log_likelihood, exit_flag, i, j) } else { initial_guess <- c(logit(best_params[1]), logit(best_params[2]), sqrt(best_params[3]), sqrt(best_params[4])) tryCatch({ @@ -864,30 +893,11 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, best_params <- result$par exit_flag <- TRUE } else { - for (alpha_init in seq(0.1, 0.9, by = 0.2)) { - for (delta_init in seq(0.1, 0.9, by = 0.2)) { - mu_init <- informed_arrival[i] / alpha_init - eps_init <- abs(total_arrival_rate[i] - informed_arrival[i]) / 2 - initial_guess <- c(logit(max(min(alpha_init, 0.999), 0.001)), - logit(max(min(delta_init, 0.999), 0.001)), - sqrt(mu_init), - sqrt(eps_init)) - tryCatch({ - result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") - }, error = function(e) { - conditionMessage(e) - }) - if (result$value < best_log_likelihood && is.finite(result$value)) { - best_params <- result$par - best_log_likelihood <- result$value - exit_flag <- TRUE - } - } - } + best_params, best_log_likelihood, exit_flag = perform_grid_search(best_log_likelihood, exit_flag, i, j) } } - if (exists("best_params")) { + if (exit_flag == TRUE) { best_params[1:2] <- sigmoid(best_params[1:2]) best_params[3:4] <- best_params[3:4] ^ 2 ivpin_estimate <- best_params[1] * best_params[3] / (2 * best_params[4] + best_params[3]) From 43d04b2e75a6e0027ff6fdc7b4bb1cc2c5912bde Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Wed, 3 Jul 2024 01:52:35 -1000 Subject: [PATCH 11/19] Update model_ivpin.R --- R/model_ivpin.R | 58 +++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 6774967..4228f84 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -817,9 +817,6 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, return(log_likelihood) } - # We start the computation of IVPIN at the samplength th index - start_index <- samplength + 1 - # Easley et al. (2012b) derive the VPIN estimator based on two moment conditions from Poisson processes: # E[|VB - VS|] ≈ alpha*mu and E[|VB + VS|] = 2*epsilon + alpha*mu. # Ke et al. (2017) suggest that these conditions should be expressed as: @@ -842,8 +839,9 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, ivpin <- numeric(nrow(bucketdata)) # Ensure ivpin is initialized - perform_grid_search <- function(best_log_likelihood, exit_flag, i, j) { - + perform_grid_search <- function(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) { + best_params <- NULL + for (alpha_init in seq(0.1, 0.9, by = 0.2)) { for (delta_init in seq(0.1, 0.9, by = 0.2)) { mu_init <- mean(informed_arrival[j:i]) / alpha_init @@ -855,46 +853,57 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, sqrt(eps_init)) tryCatch({ result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") + if (result$value < best_log_likelihood && is.finite(result$value)) { + best_params <- result$par + best_log_likelihood <- result$value + exit_flag <- TRUE + } }, error = function(e) { conditionMessage(e) }) - - if (result$value < best_log_likelihood && is.finite(result$value)) { - best_params <- result$par - best_log_likelihood <- result$value - exit_flag <- TRUE - } } } - return best_params, best_log_likelihood, exit_flag + + return(list(best_params, best_log_likelihood, exit_flag)) } - + + # We start the computation of IVPIN at the samplength th index + start_index <- samplength + 1 + # Compute iVPIN with a rolling window by iterating over each bucket. We perfom a grid search to find the optimal initial parameters of the - # first bucket, and for each subsequent bucket, we use the previous bucket's initial parametes as the current initial paramenters. - + # first bucket, and for each subsequent bucket, we use the previous bucket's initial parametes as the current initial paramenters. for (i in start_index:nrow(bucketdata)) { j <- i - samplength parms <- rep(-Inf, 4) # alpha, delta, mu, eps - + best_log_likelihood <- Inf exit_flag <- FALSE if (i == start_index) { - best_params, best_log_likelihood, exit_flag = perform_grid_search(best_log_likelihood, exit_flag, i, j) + result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) + best_params <- result[[1]] + best_log_likelihood <- result[[2]] + exit_flag <- result[[3]] } else { initial_guess <- c(logit(best_params[1]), logit(best_params[2]), sqrt(best_params[3]), sqrt(best_params[4])) tryCatch({ result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") + if (result$value < best_log_likelihood && is.finite(result$value)) { + best_params <- result$par + exit_flag <- TRUE + } else { + result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) + best_params <- result[[1]] + best_log_likelihood <- result[[2]] + exit_flag <- result[[3]] + } }, error = function(e) { conditionMessage(e) + result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) + best_params <- result[[1]] + best_log_likelihood <- result[[2]] + exit_flag <- result[[3]] }) - - if (result$value < best_log_likelihood && is.finite(result$value)) { - best_params <- result$par - exit_flag <- TRUE - } else { - best_params, best_log_likelihood, exit_flag = perform_grid_search(best_log_likelihood, exit_flag, i, j) - } } if (exit_flag == TRUE) { @@ -907,6 +916,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, ivpin[i] <- NA } } + estimateivpin@ivpin <- ivpin estimatevpin@runningtime <- ux$timediff(time_on, time_off) num_of_nan <- sum(is.na(ivpin)) From 11be6863edb25d2fd331fe1378c7f9c73217b117 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Wed, 3 Jul 2024 04:01:57 -1000 Subject: [PATCH 12/19] Update model_ivpin.R --- R/model_ivpin.R | 59 ++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 4228f84..4237f56 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -794,13 +794,6 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, bucketdata$duration <- bucketdata$duration / mean_duration bucketdata$duration <- ifelse(bucketdata$duration == 0, 1e-6, bucketdata$duration) - # The sigmoid function - sigmoid <- function(x) 1 / (1 + exp(x)) - - # The inverse sigmoid function - logit <- function(y) log(1 / y - 1) - - # The function to be optimized compute_log_likelihood <- function(params, t, Vb, Vs) { alpha <- sigmoid(params[1]) @@ -847,20 +840,34 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, mu_init <- mean(informed_arrival[j:i]) / alpha_init eps_init <- mean(abs(total_arrival_rate[j:i] - informed_arrival[j:i])) / 2 - initial_guess <- c(logit(max(min(alpha_init, 0.999), 0.001)), - logit(max(min(delta_init, 0.999), 0.001)), + initial_guess <- c(max(min(alpha_init, 0.999), 0.001), + max(min(delta_init, 0.999), 0.001), sqrt(mu_init), sqrt(eps_init)) - tryCatch({ - result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") - if (result$value < best_log_likelihood && is.finite(result$value)) { - best_params <- result$par - best_log_likelihood <- result$value - exit_flag <- TRUE - } + + # Define the lower and upper bounds for the optimization variables + low <- c(0, 0, 0, 0) # Lower bounds + up <- c(1, 1, Inf, Inf) # Upper bounds + + # Perform the constrained optimization + result <- tryCatch({ + nloptr::nloptr( + x0 = initial_guess, + eval_f = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), + lb = low, + ub = up, + opts = list(algorithm = "NLOPT_LN_NELDERMEAD", xtol_rel = 1.0e-8) + ) }, error = function(e) { - conditionMessage(e) + message("Error during optimization: ", conditionMessage(e)) + return(NULL) }) + + if (!is.null(result) && result$objective < best_log_likelihood && is.finite(result$objective)) { + best_params <- result$solution + best_log_likelihood <- result$objective + exit_flag <- TRUE + } } } @@ -872,6 +879,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # Compute iVPIN with a rolling window by iterating over each bucket. We perfom a grid search to find the optimal initial parameters of the # first bucket, and for each subsequent bucket, we use the previous bucket's initial parametes as the current initial paramenters. + start_index <- samplength + 1 for (i in start_index:nrow(bucketdata)) { j <- i - samplength parms <- rep(-Inf, 4) # alpha, delta, mu, eps @@ -885,11 +893,17 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, best_log_likelihood <- result[[2]] exit_flag <- result[[3]] } else { - initial_guess <- c(logit(best_params[1]), logit(best_params[2]), sqrt(best_params[3]), sqrt(best_params[4])) + initial_guess <- c(best_params[1], best_params[2], sqrt(best_params[3]), sqrt(best_params[4])) tryCatch({ - result <- optim(initial_guess, compute_log_likelihood, t = t[j:i], Vb = Vb[j:i], Vs = Vs[j:i], method = "BFGS") - if (result$value < best_log_likelihood && is.finite(result$value)) { - best_params <- result$par + result <- nloptr::nloptr( + x0 = initial_guess, + eval_f = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), + lb = c(0, 0, 0, 0), + ub = c(1, 1, Inf, Inf), + opts = list(algorithm = "NLOPT_LN_NELDERMEAD", xtol_rel = 1.0e-8) + ) + if (result$objective < best_log_likelihood && is.finite(result$objective)) { + best_params <- result$solution exit_flag <- TRUE } else { result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) @@ -898,7 +912,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, exit_flag <- result[[3]] } }, error = function(e) { - conditionMessage(e) + message("Error during optimization: ", conditionMessage(e)) result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) best_params <- result[[1]] best_log_likelihood <- result[[2]] @@ -907,7 +921,6 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, } if (exit_flag == TRUE) { - best_params[1:2] <- sigmoid(best_params[1:2]) best_params[3:4] <- best_params[3:4] ^ 2 ivpin_estimate <- best_params[1] * best_params[3] / (2 * best_params[4] + best_params[3]) ivpin[i] <- ivpin_estimate From 3dca53a3cf0fc634c7b66e4c0aad7ff9d304bb42 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Wed, 10 Jul 2024 01:01:01 -1000 Subject: [PATCH 13/19] Update model_ivpin.R --- R/model_ivpin.R | 620 +++++++++++------------------------------------- 1 file changed, 143 insertions(+), 477 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 4237f56..eb8cbfa 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -134,16 +134,18 @@ #' #' @export ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, - tradinghours = 24, verbose = TRUE) { + tradinghours = 24, slow == FALSE, verbose = TRUE) { " @timebarsize : the size of timebars in seconds default value: 60 @buckets : number of buckets per volume of bucket size default value: 50 - @samplength : sample length or window of buckets to calculate iVPIN default + @samplength : sample length or window of buckets to calculate VPIN default value: 50 @tradinghours : length of trading days - used to correct the durations of buckets. Default value is 24. " + data <- as.data.frame(data) + vpin_err <- uierrors$vpin() vpin_ms <- uix$vpin(timebarsize = timebarsize) @@ -153,7 +155,8 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, environment(.xcheck$existence) <- environment() .xcheck$existence(allvars, err = uierrors$vpin()$fn) - # Checking the arguments of the function are valid + # Checking the the arguments of the function are valid + # Check that all arguments are valid # ------------------------------------------------------------------------- largs <- list(data, timebarsize, buckets, samplength, tradinghours, verbose) names(largs) <- names(formals()) @@ -163,13 +166,14 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # Convert data into a dataframe, in case it is a matrix or an array data <- as.data.frame(data) - # Initialize the local variables + # # initialize the local variables tbv <- vbs <- bucket <- NULL - estimateivpin <- new("estimate.ivpin") + estimatevpin <- new("estimate.vpin") + time_on <- Sys.time() - ux$show(c= verbose, m = vpin_ms$start) + ux$show(c= verbose, m = ivpin_ms$start) ############################################################################## # STEP 0 : CHECKING AND PREPARING THE DATA @@ -179,13 +183,14 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # ---------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step1) + ux$show(c= verbose, m = ivpin_ms$step1) # We want only three columns: timestamp, price and volume so we import into # the dataset only the three first columns dataset <- data[, 1:3] + # We rename the first three columns to "timestamp", "price" and "volume" colnames(dataset) <- c("timestamp", "price", "volume") @@ -210,7 +215,8 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, ux$show(m = vpin_ms$aborted, warning = TRUE) ux$stopnow(m = vpin_err$largetimebarsize, s = vpin_err$fn) } - + + ############################################################################## # STEP 1 : CREATING THE T-SECOND TIMEBARS DATASET ############################################################################## @@ -219,13 +225,13 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # ---------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step2, skip = FALSE) + ux$show(c= verbose, m = ivpin_ms$step2, skip = FALSE) # ---------------------------------------------------------------------------- # I.1 CREATE THE VARIABLE INTERVAL # ---------------------------------------------------------------------------- - # Create a variable called interval which contains the timebar extracted from + # create a variable called interval which contains the timebar extracted from # the timestamp. If timebarsize == 60, then the observation of timestamp # "2019-04-01 00:33:49" belong to the interval "2019-04-01 00:33:00", that is # the timebar that starts at the minute 33 and lasts 1 minute (60 seconds) @@ -247,6 +253,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # (tbv: timebar volume) and price change: last price - first price (dp) per # interval + # ---------------------------------------------------------------------------- # - THIS PART IS JUST TO ESTIMATE RUNNING TIME - NO OUTPUT USED - # - - @@ -262,7 +269,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, chunk <- 5000 tempbars <- aggregate(price ~ interval, data = dataset[1:chunk, ], - FUN = function(x) dp <- tail(x, 1) - head(x, 1) + FUN = function(x) dp <- tail(x, 1) - head(x, 1) ) tempbars <- merge(tempbars, aggregate(volume ~ interval, dataset[1:chunk, ], sum), @@ -273,7 +280,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, temptime_off <- Sys.time() exptime <- ux$timediff(temptime_on, temptime_off, - 5*log2(nrow(dataset) / (chunk))) + 5*log2(nrow(dataset) / (chunk))) ux$show(c= verbose, m = paste("[~", ceiling(exptime), "seconds]")) @@ -286,7 +293,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # ---------------------------------------------------------------------------- minutebars <- aggregate(price ~ interval, data = dataset, - FUN = function(x) dp <- tail(x, 1) - head(x, 1) + FUN = function(x) dp <- tail(x, 1) - head(x, 1) ) minutebars <- merge(minutebars, aggregate(volume ~ interval, dataset, sum), @@ -304,7 +311,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step3) + ux$show(c= verbose, m = ivpin_ms$step3) # -------------------------------------------------------------------------- # II.1 CALCULATE NUMBER OF DAYS (NDAYS), TOTAL VOLUME (TOTVOL) AND THEN VBS @@ -324,377 +331,24 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, names(params) <- c("tbSize", "buckets", "samplength", "VBS", "ndays") - estimateivpin@parameters <- params + estimatevpin@parameters <- params # -------------------------------------------------------------------------- # II.2 CALCULATE THE STANDARD DEVIATION OF DP (SDP) # -------------------------------------------------------------------------- - sdp <- sd(minutebars$dp) - - ############################################################################ - # STEP 3 : BREAKING UP LARGE T-MINUTE TIME BARS' VOLUME - ############################################################################ - - # -------------------------------------------------------------------------- - # USER MESSAGE - # -------------------------------------------------------------------------- - - ux$show(c= verbose, m = vpin_ms$step4) - - # -------------------------------------------------------------------------- - # TASK DESCRIPTON - # -------------------------------------------------------------------------- - - # To avoid that one time bar spans over many buckets, we have to make sure - # to divide all large enough timebars into 'identical' timebars but with trade - # volume lower or equal to a given threshold. The threshold is a function of - # vbs. - - # The threshold is chosen in the code to be the following function of vbs: - # threshold = (1-1/x)*vbs where x is a measure of precision. If x=1, then the - # threshold is 0;if x=2 then the threshold = 50% of vbs; if x=10 then the - # threshold = 90% vbs. - - # We give an example here about what we intend to do. Assume that vbs= 1250 - # and x=5; the threshold is then: threshold = (1-1/5) vbs = 80% vbs = - # 0.8*1250 = 1000. This means we will break all minutebars with volume (tbv) - # higher than 1000 into identical timebars but with volume lower or equal - # to threshold= 1000. - - # Imagine we have the following timebar: - # - # minute dp tbv - # ------------------------------- - # 2019-04-02 04:53 53.0 5340. - - # Since threshold= 1000 we want to break this timebar into six 'identical' - # timebars (time and dp) but will volume (tbv) smaller or equal to threshold. - # The expected result is: - # - # minute dp tbv - # ------------------------------- - # 2019-04-02 04:53 53.0 1000. - # 2019-04-02 04:53 53.0 1000. - # 2019-04-02 04:53 53.0 1000. - # 2019-04-02 04:53 53.0 1000. - # 2019-04-02 04:53 53.0 1000. - # 2019-04-02 04:53 53.0 340. - - # -------------------------------------------------------------------------- - # III.1 DEFINE THE THRESHOLD AND FIND ALL TIMEBARS WITH VOLUME > THRESHOLD - # -------------------------------------------------------------------------- - - # A variable 'id' is assigned to each timebar to easily identify them in - # future tasks. - - - minutebars$id <- seq.int(nrow(minutebars)) - - # Define the precision factor (x) and calculate the threshold - - x <- 10 - threshold <- (1 - 1 / x) * vbs - - # Find all timebars with volume higher than the threshold and store them in - # largebars - # Find the position of all these timebars in the original dataset and store - # them in the vector largebnum - - largebars <- subset(minutebars, tbv > threshold) - # # -------------------------------------------------------------------------- - # # III.2 BREAK DOWN LARGE VOLUME TIMEBARS INTO SMALLER ONES - # # -------------------------------------------------------------------------- - - # We break down these large volume timebars into smaller timebars with a - # maximum volume equal to threshold. - # The code recursively splits large timebars into smaller ones and removes - # the original large timebars from the final dataset, until only - # timebars with volumes below the threshold remain. - - # If, for example, the initial minute bar with interval 10:00:00 - 10:01:00 - # has a total volume (tbv) of 2200, which exceeds the threshold of 1000, and - # is composed in the following trades: - - # trades price volume - # --------------------------------- - # 2023-06-30 10:00:05 1 500 - # 2023-06-30 10:00:10 1 600 - # 2023-06-30 10:00:20 1 700 - # 2023-06-30 10:00:30 1 400 - - # The first pass splits this initial minute bar into two 30-second intervals: - - # interval dp tbv - # --------------------------------- - # 2023-06-30 10:00:00 0 1800 (first 3 trades) - # 2023-06-30 10:00:30 1 400 (fourth trade) - - # The first 30-second interval with 1800 volume exceeds the threshold, - # so it will be split further into two 15-second intervals: - # - # interval dp tbv - # --------------------------------- - # 2023-06-30 10:00:00 0 1100 (first 2 trades) - # 2023-06-30 10:00:15 1 700 (third trade) - # 2023-06-30 10:00:30 1 400 (fourth trade) - - # The first 15-second interval with 1100 volume still exceeds the threshold, - # so it will be split further into two intervals: - - # interval dp tbv - # --------------------------------- - # 2023-06-30 10:00:00 0 500 (first trade) - # 2023-06-30 10:00:07.5 1 600 (second trade) - # 2023-06-30 10:00:30 1 400 (fourth trade) - - # The final timebars with acceptable volumes are: - - # interval dp tbv - # --------------------------------- - # 2023-06-30 10:00:00 0 500 (first trade) - # 2023-06-30 10:00:07.5 1 600 (second trade) - # 2023-06-30 10:00:15 1 700 (third trade) - # 2023-06-30 10:00:30 1 400 (last trade) - - - ##### NEW CODE TO BREAK MINUTE BARS AND ADJUST FOR 0 DURATION - - # this funtion recursive function split_timebars that splits large timebars - # into smaller ones until they have the correct volume - split_timebars <- function(data, dataset, interval_duration, threshold) { - final_timebars <- data.frame() - - # Function to split the data recursively - recursive_split <- function(data, interval_duration) { - large_timebars <- subset(data, tbv > threshold) - - if (nrow(large_timebars) == 0) { - # If no large timebars, add the data to final timebars - final_timebars <<- rbind(final_timebars, data) - return() - } - - for (i in seq_len(nrow(large_timebars))) { - large_timebar <- large_timebars[i, ] - trades <- subset(dataset, interval == large_timebar$interval) - - interval_start <- as.POSIXct(large_timebar$interval, tz = "") - midpoint <- interval_start + interval_duration / 2 - - first_half_trades <- subset(trades, timestamp <= midpoint) - second_half_trades <- subset(trades, timestamp > midpoint) - - first_interval <- format(interval_start, "%Y-%m-%d %H:%M:%S") - second_interval <- format(midpoint, "%Y-%m-%d %H:%M:%S") - - first_half <- data.frame( - interval = first_interval, - dp = ifelse(nrow(first_half_trades) > 0, tail(first_half_trades$price, 1) - head(first_half_trades$price, 1), 0), - tbv = sum(first_half_trades$volume) - ) - - second_half <- data.frame( - interval = second_interval, - dp = ifelse(nrow(second_half_trades) > 0, tail(second_half_trades$price, 1) - head(second_half_trades$price, 1), 0), - tbv = sum(second_half_trades$volume) - ) - - # Call the recursive split on the new intervals if their volumes are too large - if (first_half$tbv > threshold) { - recursive_split(first_half, interval_duration / 2) - } else { - final_timebars <<- rbind(final_timebars, first_half) - } - - if (second_half$tbv > threshold) { - recursive_split(second_half, interval_duration / 2) - } else { - final_timebars <<- rbind(final_timebars, second_half) - } - - # Remove the large time bar being split from the final_timebars - final_timebars <<- final_timebars[final_timebars$interval != large_timebar$interval, ] - } - } - - # Start the recursive split with the initial data and interval duration - recursive_split(data, interval_duration) - return(final_timebars) - } + sdp <- sd(minutebars$dp) - largebars <- subset(minutebars, tbv > threshold) - if (nrow(largebars) > 0) { - split_bars <- split_timebars(largebars, dataset, timebarsize, threshold) - minutebars <- minutebars[-which(minutebars$tbv > threshold), ] - minutebars <- rbind(minutebars, split_bars) - minutebars <- minutebars[order(as.POSIXct(minutebars$interval, format = "%Y-%m-%d %H:%M:%S")), ] - } - - # Reassign new identifiers to time bars - minutebars$id <- seq.int(nrow(minutebars)) - - ############################################################################ - # STEP 4 : ASSIGNING T-MINUTE TIME BARS INTO BUCKETS + # STEP 3 : CALCULATING BUY AND SELL VOLUME ############################################################################ # -------------------------------------------------------------------------- # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step5) - - # -------------------------------------------------------------------------- - # IV.1 ASSIGN A BUCKET TO EACH TIMEBAR | FIND EXCESS VOLUME FOR EACH OF THEM - # -------------------------------------------------------------------------- - - # Find the cumulative volume (runvol) over all minutebars - - minutebars$runvol <- cumsum(minutebars$tbv) - - # Find the bucket to which each timebar belongs, using integer division by the - # volume bucket size (vbs). If vbs = 100 and the cumulative volume is 70; it - # means that we have not yet filled the first bucket so the timebar should - # belong to bucket 1. We do that using the integer division 70/100 which - # gives 0 and to which we add 1 to obtain bucket.In general the bucket size - # is obtained by integer-dividing the cumulative volume (runvol) by vbs and - # adding +1. If the cumulative volume is 472, we know that it must be in - # bucket 5. Indeed, the formula gives us bucket 5 since 472%/%100+1=4+1 = 5 - - minutebars$bucket <- 1 + minutebars$runvol %/% vbs - - # The variable excess volume (exvol) calculates the excess volume after we - # have filled the bucket. As in the example above, if the timebar has - # cumulative volume runvol = 472. The excess volume should be 72; which is - # what is left after filling 4 buckets. Since we are in bucket 5; the excess - # volume can be obtained as 72 = runvol - (5-1)*vbs = 472 - 4*100 - # In general the formula: exvol = runvol - (bucket -1)*vbs - - minutebars$exvol <- minutebars$runvol - (minutebars$bucket - 1) * vbs - - ############################################################################ - # STEP 5 : BALANCING TIMEBARS AND ADJUSTING BUCKET SIZES TO VBS - ############################################################################ - - # -------------------------------------------------------------------------- - # USER MESSAGE - # -------------------------------------------------------------------------- - - ux$show(c= verbose, m = vpin_ms$step6) - - # -------------------------------------------------------------------------- - # TASK DESCRIPTION - # -------------------------------------------------------------------------- - - # We will give attention here to timebars that occur between buckets, that is - # have volume that belongs to two buckets at the same time. Assume vbs=100 and - # we have the following: - # - # interval dp tbv runvol bucket exvol - # -------------------------------------------------------- - # 2019-04-02 04:52 14.0 50. 90 1 90 - # 2019-04-02 04:53 53.0 30. 120 2 20 - # - # The timebar of the interval "2019-04-02 04:52" has a tbv of 30. A volume of - # 10 belongs to bucket 1 that used to have 90 and needs 10 to reach vbs=100; - # and a volume of 20 belongs to bucket 2 (which we can find in the excess - # volume exvol). Basically, for each first timebar of a bucket, the volume - # that belongs to the previous bucket is tbv - exvol (30-20=10 in our - # example) and the volume that belongs to the current bucket is exvol (20 in - # our example.). - # - # In order to have balanced buckets, we want our code to 'divide' the first - # timebar of bucket 2 into 'identical' timebars one containing a volume of - # 10 and belonging to bucket 1 and the other containing a volume of 20 and - # belonging to bucket 2. The output shall look like. - # - # interval dp tbv runvol bucket exvol - # -------------------------------------------------------- - # 2019-04-02 04:52 14.0 50. 90 1 90 - # 2019-04-02 04:53 53.0 10. 120 1 100 - # 2019-04-02 04:53 53.0 20. 120 2 20 - - # -------------------------------------------------------------------------- - # V.1 FINDING THE FIRST TIMEBAR IN EACH BUCKET - # -------------------------------------------------------------------------- - - # Find the first timebar in each bucket, we will have to ignore the first - # timebar of bucket 1 as it does not share trade volume with a previous bucket - - xtrarows <- - aggregate(. ~ bucket, subset(minutebars, bucket != 1), FUN = head, 1) - - # In our example, xtrarows should contain the following timebar - # - # interval dp tbv runvol bucket exvol - # -------------------------------------------------------- - # 2019-04-02 04:53 53.0 30. 120 2 120 - - # -------------------------------------------------------------------------- - # V.2 CHANGE THE VOLUME OF FIRST TIMEBAR IN EACH BUCKET TO EXVOL - # -------------------------------------------------------------------------- - - # In order to change any row in the main dataset, we need the row identifier - # (id). For the timebars that we have extracted in the dataframe xtrarows, we - # find their identifiers and store them in the vector xtraNum - - xtrarnum <- which(minutebars$id %in% xtrarows$id) - - # Now that we have found all the identifiers; for each minutebar having an - # identifier in xtrarnum, change the value volume (tbv) to the value of excess - # volume (exvol). - - minutebars[xtrarnum, "tbv"] <- minutebars[xtrarnum, "exvol"] - - # -------------------------------------------------------------------------- - # V.3 CREATE LAST MINUTEBAR IN EACH BUCKET TO REACH VBS - # -------------------------------------------------------------------------- - - # In the task description, we have established that we need to add a replicate - # of the first timebar of each bucket to the previous bucket and containing - # the volume equal to tbv - exvol. Review the task description if this is not - # clear. All such first timebars already exist in the dataframe xtrarows. We - # will just change the bucket number to the one before it and set tbv to - # tbv-exvol. - - xtrarows$tbv <- xtrarows$tbv - xtrarows$exvol - xtrarows$bucket <- xtrarows$bucket - 1 - - # -------------------------------------------------------------------------- - # V.4 CALCULATE ACCUMULATED BUCKET VOLUME FOR THE ORIGINAL DATASET - # -------------------------------------------------------------------------- - - # We add the replicated timebars in xtrarows to the main dataset - # 'minutebars' and sort it by interval and by bucket so we have all bucket - # minutebars next to one another. - - xtrarows$interval <- as.POSIXct(xtrarows$interval, - origin = "1970-01-01", tz = "") - xtrarows <- xtrarows[c("interval", "dp", "tbv", "id", "runvol", "bucket", - "exvol")] - - minutebars <- rbind(minutebars, xtrarows) - minutebars <- minutebars[order(minutebars$interval, minutebars$bucket), ] - - # We calculate accumulated bucket volume which is the exvol for the new values - # of tbv - - minutebars$runvol <- cumsum(minutebars$tbv) - minutebars$exvol <- minutebars$runvol - (minutebars$bucket - 1) * vbs - - rm(xtrarnum, xtrarows) - - ############################################################################ - # STEP 6 : CALCULATING AGGREGATE BUCKET DATA - ############################################################################ - - # -------------------------------------------------------------------------- - # USER MESSAGE - # -------------------------------------------------------------------------- - - ux$show(c= verbose, m = vpin_ms$step7) + ux$show(c= verbose, m = vpin_ms$step4) # -------------------------------------------------------------------------- # VI.1 ASSIGN N(0, 1) PROB. TO EACH TIMEBAR AND CALCULATE BUY/SELL VOLUMES @@ -703,92 +357,114 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # Use the cdf of N(0, 1) to calculate zb = Z(dp/sdp) and zs = 1- Z(dp/sdp) # The variable zb calculates the probability that the current price change # normalized by standard deviation (dp/sdp) corresponds to timebar with - # buyer-initiated transactions, while zs calculates the probability that + # buyer-initiated transacations, while zs calculates the probability that # the current price change normalized by standard deviation (dp/sdp) - # corresponds to timebar with seller-initiated transactions. - + # corresponds to timebar with seller-initiated transacations. minutebars$zb <- pnorm(minutebars$dp / sdp) minutebars$zs <- 1 - pnorm(minutebars$dp / sdp) + # Calculate Buy Volume (bvol) and Sell volume (svol) by multiplying timebar's # volume (tbv) by the corresponding probabilities zb and zs. + minutebars$bvol <- minutebars$tbv * minutebars$zb minutebars$svol <- minutebars$tbv * minutebars$zs + minutebars <- minutebars[which(minutebars$tbv > 0), ] - # -------------------------------------------------------------------------- - # VI.2 CALCULATING AGGREGATE BUCKET DATA - # -------------------------------------------------------------------------- - # Aggregate variables - # ++++++++++++++++++++ - # agg.bvol : sum of buy volume (bvol) per bucket - # agg.svol : sum of sell volume (svol) per bucket - # OI : the difference between agg.bvol and agg.svol - # init.time : the first timebar in the bucket - # final.time : the last timebar in the bucket - # +++++++++++++++++++ - bucketdata <- setNames( - aggregate(cbind(bvol, svol) ~ bucket, minutebars, sum), - c("bucket", "agg.bvol", "agg.svol")) - bucketdata$aoi <- abs(bucketdata$agg.bvol - bucketdata$agg.svol) - bucketdata$starttime <- aggregate(interval ~ bucket, - minutebars, head, 1)$interval - bucketdata$endtime <- aggregate(interval ~ bucket, - minutebars, tail, 1)$interval - bucketdata$duration <- as.numeric( - difftime(bucketdata$endtime, bucketdata$starttime, units = "secs")) - zero_duration_percentage <- sum(bucketdata$duration == 0) / nrow(bucketdata) * 100 - if (zero_duration_percentage > 5) { - warning(sprintf("More than 5%% of duration values are zero, reduce the timebar duration or increase the bucket size.")) - } ############################################################################ - # STEP 7 : CALCULATING iVPIN VECTOR FOLLOWING KE 2017 + # STEP 4 : ASSIGNING TIME BARS TO VOLUME BUCKETS ############################################################################ # -------------------------------------------------------------------------- # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step8) - - # -------------------------------------------------------------------------- - # TASK DESCRIPTION - # -------------------------------------------------------------------------- + ux$show(c= verbose, m = vpin_ms$step5) - # We calculate IVPIN from Ke et al. 2017 from the following formula - # ivpin=(alpha*mu)/(2*epsilon+mu) where: - # -alpha is the pobability of information event occurrence - # -Mu is the informed traders’ arrival rate - # -Epsilon is the uninformed traders’ arrival rate - # -Delta is the probabilities of bad news - # We estimate IVPIN on a rolling window of length samplelength buckets. We estimate the - # parameters alpha, mu, epsilon, delta by MLE of the function in compute_log_likelihood. + # Initialize bucketdata + bucketdata <- data.frame(bucket = integer(), + agg.bvol = numeric(), + agg.svol = numeric(), + duration = numeric(), + current_volume = numeric()) + + current_bucket <- 1 + current_volume <- 0 + current_duration <- 0 + + for (i in 1:nrow(minutebars)) { + tb <- minutebars[i, ] + remaining_volume <- tb$tbv + tb_duration <- timebarsize + + while (remaining_volume > 0) { + if (current_volume + remaining_volume <= vbs) { + # Assign entire remaining volume to the current bucket + current_volume <- current_volume + remaining_volume + current_duration <- current_duration + tb_duration + + # Update bucketdata with current bucket information + if (nrow(bucketdata) >= current_bucket) { + bucketdata$agg.bvol[current_bucket] <- bucketdata$agg.bvol[current_bucket] + tb$bvol + bucketdata$agg.svol[current_bucket] <- bucketdata$agg.svol[current_bucket] + tb$svol + bucketdata$duration[current_bucket] <- current_duration + bucketdata$current_volume[current_bucket] <- current_volume + } else { + bucketdata <- rbind(bucketdata, data.frame( + bucket = current_bucket, + agg.bvol = tb$bvol, + agg.svol = tb$svol, + duration = current_duration, + current_volume = current_volume + )) + } - # -------------------------------------------------------------------------- - # VII.2 CORRECTING THE DURATION VECTOR - # -------------------------------------------------------------------------- + remaining_volume <- 0 + } else { + # Fill the current bucket to vbs and create a new bucket + vol_to_add <- vbs - current_volume + proportion_to_add <- vol_to_add / tb$tbv + current_volume <- vbs + current_duration <- current_duration + tb_duration * proportion_to_add + + # Update bucketdata with current bucket information + if (nrow(bucketdata) >= current_bucket) { + bucketdata$agg.bvol[current_bucket] <- bucketdata$agg.bvol[current_bucket] + tb$bvol * proportion_to_add + bucketdata$agg.svol[current_bucket] <- bucketdata$agg.svol[current_bucket] + tb$svol * proportion_to_add + bucketdata$duration[current_bucket] <- current_duration + bucketdata$current_volume[current_bucket] <- current_volume + } else { + bucketdata <- rbind(bucketdata, data.frame( + bucket = current_bucket, + agg.bvol = tb$bvol * proportion_to_add, + agg.svol = tb$svol * proportion_to_add, + duration = current_duration, + current_volume = current_volume + )) + } - corduration <- function(etime, stime, duration) { + # Start a new bucket + current_bucket <- current_bucket + 1 + current_volume <- 0 + current_duration <- tb_duration * (1 - proportion_to_add) + remaining_volume <- remaining_volume - vol_to_add + } + } + } - days <- round(as.numeric(difftime(etime, stime, units = "days"))) + ############################################################################ + # STEP 5 : COMPUTE IVPIN + ############################################################################ - duration <- as.numeric(duration) + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- - if (days > 0) { - dseconds <- (days - 1) * 3600 * 24 + (24 - tradinghours) * 3600 - return(duration - dseconds) - } - return(duration) - } + ux$show(c= verbose, m = vpin_ms$step6) - if (tradinghours > 0 & tradinghours < 24) { - bucketdata$duration <- apply(bucketdata, 1, function(x) - corduration(x[6], x[5], x[10])) - } - print(nrow(data)) - # Normalize the duration to be centered around 1 mean_duration <- mean(bucketdata$duration) bucketdata$duration <- bucketdata$duration / mean_duration @@ -796,10 +472,10 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # The function to be optimized compute_log_likelihood <- function(params, t, Vb, Vs) { - alpha <- sigmoid(params[1]) - delta <- sigmoid(params[2]) - mu <- params[3] ^ 2 - eps <- params[4] ^ 2 + alpha <- params[1] + delta <- params[2] + mu <- params[3] + eps <- params[4] e_1 <- log(alpha * delta) + Vb * log(eps) + Vs * log(eps + mu) - (2 * eps + mu) * t e_2 <- log(alpha * (1 - delta)) + Vb * log(eps + mu) + Vs * log(eps) - (2 * eps + mu) * t @@ -832,18 +508,16 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, ivpin <- numeric(nrow(bucketdata)) # Ensure ivpin is initialized - perform_grid_search <- function(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) { + perform_grid_search <- function(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) { best_params <- NULL - + best_log_likelihood <- Inf + exit_flag <- FALSE for (alpha_init in seq(0.1, 0.9, by = 0.2)) { for (delta_init in seq(0.1, 0.9, by = 0.2)) { mu_init <- mean(informed_arrival[j:i]) / alpha_init eps_init <- mean(abs(total_arrival_rate[j:i] - informed_arrival[j:i])) / 2 - initial_guess <- c(max(min(alpha_init, 0.999), 0.001), - max(min(delta_init, 0.999), 0.001), - sqrt(mu_init), - sqrt(eps_init)) + initial_guess <- c(alpha_init, delta_init, mu_init, eps_init) # Define the lower and upper bounds for the optimization variables low <- c(0, 0, 0, 0) # Lower bounds @@ -851,77 +525,71 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # Perform the constrained optimization result <- tryCatch({ - nloptr::nloptr( + nloptr::slsqp( x0 = initial_guess, - eval_f = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), + fn = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), lb = low, - ub = up, - opts = list(algorithm = "NLOPT_LN_NELDERMEAD", xtol_rel = 1.0e-8) + ub = up ) }, error = function(e) { message("Error during optimization: ", conditionMessage(e)) return(NULL) }) - if (!is.null(result) && result$objective < best_log_likelihood && is.finite(result$objective)) { - best_params <- result$solution - best_log_likelihood <- result$objective + if (!is.null(result) && !is.na(result$value) && result$value < best_log_likelihood) { + best_params <- result$par + best_log_likelihood <- result$value exit_flag <- TRUE } } } - return(list(best_params, best_log_likelihood, exit_flag)) } - - # We start the computation of IVPIN at the samplength th index + + # We start the computation of IVPIN at the samplength-th index start_index <- samplength + 1 - # Compute iVPIN with a rolling window by iterating over each bucket. We perfom a grid search to find the optimal initial parameters of the # first bucket, and for each subsequent bucket, we use the previous bucket's initial parametes as the current initial paramenters. - start_index <- samplength + 1 for (i in start_index:nrow(bucketdata)) { j <- i - samplength parms <- rep(-Inf, 4) # alpha, delta, mu, eps best_log_likelihood <- Inf exit_flag <- FALSE - - if (i == start_index) { - result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) + + # compute the first bucket's ivpin with grid search + # if slow is passed as TRUE, compute all buckets with grid search + if (i == start_index || slow == TRUE) { + result <- perform_grid_search(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) best_params <- result[[1]] best_log_likelihood <- result[[2]] exit_flag <- result[[3]] } else { - initial_guess <- c(best_params[1], best_params[2], sqrt(best_params[3]), sqrt(best_params[4])) + # remaining buckets use the previous bucket's parameters as initial guess for the optimization + initial_guess <- c(best_params[1], best_params[2], best_params[3], best_params[4]) + initial_guess <- c(0.5, 0.5, 50, 100) tryCatch({ - result <- nloptr::nloptr( - x0 = initial_guess, - eval_f = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), - lb = c(0, 0, 0, 0), - ub = c(1, 1, Inf, Inf), - opts = list(algorithm = "NLOPT_LN_NELDERMEAD", xtol_rel = 1.0e-8) + result <- nloptr::slsqp( + initial_guess, + fn = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), + lower = c(0, 0, 0, 0), + upper = c(1, 1, Inf, Inf) ) - if (result$objective < best_log_likelihood && is.finite(result$objective)) { - best_params <- result$solution + if (!is.null(result) && !is.na(result$value) && result$value < best_log_likelihood) { + best_params <- result$par exit_flag <- TRUE } else { - result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) + # if the previous parameters failed, perform grid search + result <- perform_grid_search(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) best_params <- result[[1]] best_log_likelihood <- result[[2]] - exit_flag <- result[[3]] + exit_flag <- FALSE } }, error = function(e) { message("Error during optimization: ", conditionMessage(e)) - result <- perform_grid_search(best_log_likelihood, exit_flag, i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) - best_params <- result[[1]] - best_log_likelihood <- result[[2]] - exit_flag <- result[[3]] }) } - if (exit_flag == TRUE) { - best_params[3:4] <- best_params[3:4] ^ 2 ivpin_estimate <- best_params[1] * best_params[3] / (2 * best_params[4] + best_params[3]) ivpin[i] <- ivpin_estimate } else { @@ -933,10 +601,8 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, estimateivpin@ivpin <- ivpin estimatevpin@runningtime <- ux$timediff(time_on, time_off) num_of_nan <- sum(is.na(ivpin)) - if (num_of_nan > 0) { - warning(sprintf("Optimization failed for %d buckets", num_of_nan)) - } ux$show(c= verbose, m = vpin_ms$complete) + + return(ivpin) - return(estimateivpin) } From a51ff8b39950504a6c4fef8f2dc3cdf560fc1fef Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Wed, 10 Jul 2024 01:02:51 -1000 Subject: [PATCH 14/19] Update DESCRIPTION --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 27c869d..43a659b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ LazyDataCompression: xz RoxygenNote: 7.2.1 Roxygen: list(markdown = TRUE) VignetteBuilder: knitr -Imports: Rdpack, knitr, methods, skellam, nloptr, furrr, future, dplyr, rmarkdown, coda, lubridate +Imports: Rdpack, knitr, methods, skellam, nloptr, furrr, future, dplyr, rmarkdown, coda RdMacros: Rdpack Depends: R (>= 3.5.0) Suggests: fansi, htmltools From 90ed216def7cc5c9af610b4eecdc98d285dfed35 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Wed, 10 Jul 2024 01:47:19 -1000 Subject: [PATCH 15/19] Update output_classes.R --- R/output_classes.R | 93 +--------------------------------------------- 1 file changed, 1 insertion(+), 92 deletions(-) diff --git a/R/output_classes.R b/R/output_classes.R index 87814b6..1845585 100644 --- a/R/output_classes.R +++ b/R/output_classes.R @@ -268,7 +268,7 @@ setMethod( # load the digits for display of decimals digits <- getOption("PIN-digits") - interface <- uiclasses$vpin(object) + interface <- ifelse(improved, uiclasses$ivpin(object), uiclasses$vpin(object)) ux$show(m = interface$line) ux$show(m = interface$outcome) @@ -301,97 +301,6 @@ setMethod( } ) - - -# ---------------------------------------------------------------------------- # -# IVPIN Classes # -# ---------------------------------------------------------------------------- # - -#' @title IVPIN estimation results -#' -#' @description The class \code{estimate.ivpin} is a blueprint for `S4` objects -#' that store the results of the `IVPIN` estimation method using the function -#' \code{ivpin()}. -#' @slot success (`logical`) returns the value \code{TRUE} when the estimation -#' has succeeded, \code{FALSE} otherwise. -#' @slot errorMessage (`character`) returns an error message if the `IVPIN` -#' estimation has failed, and is empty otherwise. -#' @slot parameters (`numeric`) returns a numeric vector of estimation -#' parameters (tbSize, buckets, samplength, VBS, #days), where `tbSize` is the -#' size of timebars (in seconds); `buckets` is the number of buckets per average -#' volume day; `VBS` is Volume Bucket Size (daily average volume/number of -#' buckets `buckets`); `samplength` is the length of the window used to estimate -#' `IVPIN`; and `#days` is the number of days in the dataset. -#' @slot bucketdata (`dataframe`) returns the dataframe containing detailed -#' information about buckets. -#' @slot ivpin (`numeric`) returns the vector of the volume-synchronized -#' probabilities of informed trading. -#' @slot dailyivpin (`dataframe`) returns the daily `IVPIN` values. Two -#' variants are provided for any given day: \code{divpin} corresponds to -#' the unweighted average of ivpin values, and \code{divpin.weighted} -#' corresponds to the average of ivpin values weighted by bucket duration. -#' @slot runningtime (`numeric`) returns the running time of the `IVPIN` -#' estimation in seconds. -setClass( - "estimate.ivpin", - slots = list( - success = "logical", errorMessage = "character", parameters = "numeric", - bucketdata = "data.frame", ivpin = "numeric", dailyivpin = "data.frame", - runningtime = "numeric" - ), - prototype = list( - success = TRUE, errorMessage = "", parameters = 0, - bucketdata = data.frame(), ivpin = 0, dailyivpin = data.frame(), - runningtime = 0 - ) -) - -#' @rdname estimate.ivpin-class -#' @description The function show() displays a description of the -#' estimate.ivpin object: descriptive statistics of the `IVPIN` variable, -#' the set of relevant parameters, and the running time. -#' @param object an object of class \code{estimate.ivpin} -setMethod( - "show", signature(object = "estimate.ivpin"), - function(object) { - - # load the digits for display of decimals - digits <- getOption("PIN-digits") - - interface <- uiclasses$ivpin(object) - - ux$show(m = interface$line) - ux$show(m = interface$outcome) - ux$show(m = interface$line) - ux$show(m = interface$ivpinfunctions) - - if (object@success) { - - ux$show(m = interface$badge, skip = FALSE) - - xsummary <- interface$ivpinsummary - - show(knitr::kable(xsummary, "simple", padding = 1L, align = "c", - caption = interface$summarycaption - )) - - xparams <- interface$ivpinparams - - show(knitr::kable(xparams, "simple", padding = 1L, align = "c", - caption = interface$paramscaption - )) - - } else { - - ux$show(m = interface$error, warning = TRUE) - } - - ux$show(m = interface$runningtime) - - } -) - - # ---------------------------------------------------------------------------- # # MPIN Classes # # ---------------------------------------------------------------------------- # From 43ae5a4e266043b2f390da68240a785e8ad5a962 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Wed, 10 Jul 2024 01:48:53 -1000 Subject: [PATCH 16/19] Update model_ivpin.R --- R/model_ivpin.R | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index eb8cbfa..41dd2df 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -134,7 +134,7 @@ #' #' @export ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, - tradinghours = 24, slow == FALSE, verbose = TRUE) { + tradinghours = 24, slow = FALSE, grid_size = 0.2 verbose = TRUE) { " @timebarsize : the size of timebars in seconds default value: 60 @@ -143,6 +143,10 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, value: 50 @tradinghours : length of trading days - used to correct the durations of buckets. Default value is 24. + @slow : TRUE computes the initial parameters by grid search at each + bucket + @grid_size : spacing between elements of the grid search. Smaller is more precise + default value: 0.2 " data <- as.data.frame(data) @@ -508,12 +512,12 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, ivpin <- numeric(nrow(bucketdata)) # Ensure ivpin is initialized - perform_grid_search <- function(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) { + perform_grid_search <- function(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs, grid_size) { best_params <- NULL best_log_likelihood <- Inf exit_flag <- FALSE - for (alpha_init in seq(0.1, 0.9, by = 0.2)) { - for (delta_init in seq(0.1, 0.9, by = 0.2)) { + for (alpha_init in seq(0.1, 0.9, by = grid_size)) { + for (delta_init in seq(0.1, 0.9, by = grid_size)) { mu_init <- mean(informed_arrival[j:i]) / alpha_init eps_init <- mean(abs(total_arrival_rate[j:i] - informed_arrival[j:i])) / 2 @@ -536,7 +540,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, return(NULL) }) - if (!is.null(result) && !is.na(result$value) && result$value < best_log_likelihood) { + if (!inherits(result, "try-error") && result$value < best_log_likelihood) { best_params <- result$par best_log_likelihood <- result$value exit_flag <- TRUE @@ -556,16 +560,13 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, best_log_likelihood <- Inf exit_flag <- FALSE - - # compute the first bucket's ivpin with grid search - # if slow is passed as TRUE, compute all buckets with grid search + if (i == start_index || slow == TRUE) { - result <- perform_grid_search(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) + result <- perform_grid_search(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs, grid_size) best_params <- result[[1]] best_log_likelihood <- result[[2]] exit_flag <- result[[3]] } else { - # remaining buckets use the previous bucket's parameters as initial guess for the optimization initial_guess <- c(best_params[1], best_params[2], best_params[3], best_params[4]) initial_guess <- c(0.5, 0.5, 50, 100) tryCatch({ @@ -575,12 +576,11 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, lower = c(0, 0, 0, 0), upper = c(1, 1, Inf, Inf) ) - if (!is.null(result) && !is.na(result$value) && result$value < best_log_likelihood) { + if (!inherits(result, "try-error") && result$value < best_log_likelihood) { best_params <- result$par exit_flag <- TRUE } else { - # if the previous parameters failed, perform grid search - result <- perform_grid_search(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs) + result <- perform_grid_search(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs, grid_size) best_params <- result[[1]] best_log_likelihood <- result[[2]] exit_flag <- FALSE @@ -598,9 +598,12 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, } } - estimateivpin@ivpin <- ivpin + estimatevpin@ivpin <- ivpin estimatevpin@runningtime <- ux$timediff(time_on, time_off) num_of_nan <- sum(is.na(ivpin)) + if (num_of_nan > 0) { + warning(sprintf("Optimization failed for %d buckets", num_of_nan)) + } ux$show(c= verbose, m = vpin_ms$complete) return(ivpin) From bbb7608ffde856ea5a087ee6b6fd20b760215ca2 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:07:59 -1000 Subject: [PATCH 17/19] Update model_ivpin.R --- R/model_ivpin.R | 222 ++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 122 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 41dd2df..65d735a 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -133,8 +133,8 @@ #' show(head(buckets, 10)) #' #' @export -ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, - tradinghours = 24, slow = FALSE, grid_size = 0.2 verbose = TRUE) { +ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, + tradinghours = 24, slow = FALSE, grid_size = 0.2, verbose = TRUE) { " @timebarsize : the size of timebars in seconds default value: 60 @@ -150,34 +150,34 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, " data <- as.data.frame(data) - vpin_err <- uierrors$vpin() - vpin_ms <- uix$vpin(timebarsize = timebarsize) + # vpin_err <- uierrors$vpin() + # vpin_ms <- uix$vpin(timebarsize = timebarsize) # Check that all variables exist and do not refer to non-existent variables # -------------------------------------------------------------------------- - allvars <- names(formals()) - environment(.xcheck$existence) <- environment() - .xcheck$existence(allvars, err = uierrors$vpin()$fn) + # allvars <- names(formals()) + # environment(.xcheck$existence) <- environment() + # .xcheck$existence(allvars, err = uierrors$vpin()$fn) # Checking the the arguments of the function are valid # Check that all arguments are valid # ------------------------------------------------------------------------- - largs <- list(data, timebarsize, buckets, samplength, tradinghours, verbose) + largs <- list(data, timebarsize, buckets, samplength, tradinghours, slow, grid_size, verbose) names(largs) <- names(formals()) - rst <- .xcheck$args(arglist = largs, fn = "vpin") - ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) + # rst <- .xcheck$args(arglist = largs, fn = "vpin") + # ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) # Convert data into a dataframe, in case it is a matrix or an array data <- as.data.frame(data) # # initialize the local variables tbv <- vbs <- bucket <- NULL - estimatevpin <- new("estimate.vpin") + # estimatevpin <- new("estimate.vpin") time_on <- Sys.time() - ux$show(c= verbose, m = ivpin_ms$start) + # ux$show(c= verbose, m = ivpin_ms$start) ############################################################################## # STEP 0 : CHECKING AND PREPARING THE DATA @@ -187,7 +187,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # ---------------------------------------------------------------------------- - ux$show(c= verbose, m = ivpin_ms$step1) + # ux$show(c= verbose, m = ivpin_ms$step1) # We want only three columns: timestamp, price and volume so we import into # the dataset only the three first columns @@ -215,10 +215,10 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, alltime <- 1000 * as.numeric(difftime(max(dataset$timestamp), min(dataset$timestamp), units = "secs")) - if (timebarsize >= alltime) { - ux$show(m = vpin_ms$aborted, warning = TRUE) - ux$stopnow(m = vpin_err$largetimebarsize, s = vpin_err$fn) - } + # if (timebarsize >= alltime) { + # ux$show(m = vpin_ms$aborted, warning = TRUE) + # ux$stopnow(m = vpin_err$largetimebarsize, s = vpin_err$fn) + # } ############################################################################## @@ -229,7 +229,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # ---------------------------------------------------------------------------- - ux$show(c= verbose, m = ivpin_ms$step2, skip = FALSE) + # ux$show(c= verbose, m = ivpin_ms$step2, skip = FALSE) # ---------------------------------------------------------------------------- # I.1 CREATE THE VARIABLE INTERVAL @@ -261,37 +261,37 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # ---------------------------------------------------------------------------- # - THIS PART IS JUST TO ESTIMATE RUNNING TIME - NO OUTPUT USED - # - - - diffseconds <- function(time_on, time_off) { - dsecs <- difftime(time_off, time_on, units = "secs") - return(dsecs) - } + # diffseconds <- function(time_on, time_off) { + # dsecs <- difftime(time_off, time_on, units = "secs") + # return(dsecs) + # } - if (nrow(dataset) >= 50000) { + # if (nrow(dataset) >= 50000) { - temptime_on <- Sys.time() + # temptime_on <- Sys.time() - chunk <- 5000 + # chunk <- 5000 - tempbars <- aggregate(price ~ interval, data = dataset[1:chunk, ], - FUN = function(x) dp <- tail(x, 1) - head(x, 1) - ) - tempbars <- merge(tempbars, - aggregate(volume ~ interval, dataset[1:chunk, ], sum), - by = "interval" - ) - tempbars$interval <- as.POSIXct(tempbars$interval, tz = "") + # tempbars <- aggregate(price ~ interval, data = dataset[1:chunk, ], + # FUN = function(x) dp <- tail(x, 1) - head(x, 1) + # ) + # tempbars <- merge(tempbars, + # aggregate(volume ~ interval, dataset[1:chunk, ], sum), + # by = "interval" + # ) + # tempbars$interval <- as.POSIXct(tempbars$interval, tz = "") - temptime_off <- Sys.time() + # temptime_off <- Sys.time() - exptime <- ux$timediff(temptime_on, temptime_off, - 5*log2(nrow(dataset) / (chunk))) + # exptime <- ux$timediff(temptime_on, temptime_off, + # 5*log2(nrow(dataset) / (chunk))) - ux$show(c= verbose, m = paste("[~", ceiling(exptime), "seconds]")) + # ux$show(c= verbose, m = paste("[~", ceiling(exptime), "seconds]")) - } else { + # } else { - ux$show(c= verbose, m = "") - } + # # ux$show(c= verbose, m = "") + # } # - - # ---------------------------------------------------------------------------- @@ -315,7 +315,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = ivpin_ms$step3) + # ux$show(c= verbose, m = ivpin_ms$step3) # -------------------------------------------------------------------------- # II.1 CALCULATE NUMBER OF DAYS (NDAYS), TOTAL VOLUME (TOTVOL) AND THEN VBS @@ -335,7 +335,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, names(params) <- c("tbSize", "buckets", "samplength", "VBS", "ndays") - estimatevpin@parameters <- params + # estimatevpin@parameters <- params # -------------------------------------------------------------------------- # II.2 CALCULATE THE STANDARD DEVIATION OF DP (SDP) @@ -352,7 +352,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step4) + # ux$show(c= verbose, m = vpin_ms$step4) # -------------------------------------------------------------------------- # VI.1 ASSIGN N(0, 1) PROB. TO EACH TIMEBAR AND CALCULATE BUY/SELL VOLUMES @@ -385,79 +385,59 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step5) + # ux$show(c= verbose, m = vpin_ms$step5) # Initialize bucketdata - bucketdata <- data.frame(bucket = integer(), - agg.bvol = numeric(), - agg.svol = numeric(), - duration = numeric(), - current_volume = numeric()) - + num_buckets <- ceiling(sum(minutebars$tbv) / vbs) + bucketdata <- data.frame( + bucket = 1:num_buckets, + agg.bvol = numeric(num_buckets), + agg.svol = numeric(num_buckets), + duration = numeric(num_buckets), + current_volume = numeric(num_buckets), + stringsAsFactors = FALSE + ) + current_bucket <- 1 - current_volume <- 0 - current_duration <- 0 - + remaining_volume <- vbs + for (i in 1:nrow(minutebars)) { - tb <- minutebars[i, ] - remaining_volume <- tb$tbv - tb_duration <- timebarsize - - while (remaining_volume > 0) { - if (current_volume + remaining_volume <= vbs) { - # Assign entire remaining volume to the current bucket - current_volume <- current_volume + remaining_volume - current_duration <- current_duration + tb_duration - - # Update bucketdata with current bucket information - if (nrow(bucketdata) >= current_bucket) { - bucketdata$agg.bvol[current_bucket] <- bucketdata$agg.bvol[current_bucket] + tb$bvol - bucketdata$agg.svol[current_bucket] <- bucketdata$agg.svol[current_bucket] + tb$svol - bucketdata$duration[current_bucket] <- current_duration - bucketdata$current_volume[current_bucket] <- current_volume - } else { - bucketdata <- rbind(bucketdata, data.frame( - bucket = current_bucket, - agg.bvol = tb$bvol, - agg.svol = tb$svol, - duration = current_duration, - current_volume = current_volume - )) - } - - remaining_volume <- 0 + bar_volume <- minutebars$tbv[i] + bar_bvol <- minutebars$bvol[i] + bar_svol <- minutebars$svol[i] + bar_duration <- 60 + + while (bar_volume > 0 && current_bucket <= num_buckets) { + if (bar_volume <= remaining_volume) { + # The entire bar fits in the current bucket + proportion <- 1 + volume_to_add <- bar_volume } else { - # Fill the current bucket to vbs and create a new bucket - vol_to_add <- vbs - current_volume - proportion_to_add <- vol_to_add / tb$tbv - current_volume <- vbs - current_duration <- current_duration + tb_duration * proportion_to_add - - # Update bucketdata with current bucket information - if (nrow(bucketdata) >= current_bucket) { - bucketdata$agg.bvol[current_bucket] <- bucketdata$agg.bvol[current_bucket] + tb$bvol * proportion_to_add - bucketdata$agg.svol[current_bucket] <- bucketdata$agg.svol[current_bucket] + tb$svol * proportion_to_add - bucketdata$duration[current_bucket] <- current_duration - bucketdata$current_volume[current_bucket] <- current_volume - } else { - bucketdata <- rbind(bucketdata, data.frame( - bucket = current_bucket, - agg.bvol = tb$bvol * proportion_to_add, - agg.svol = tb$svol * proportion_to_add, - duration = current_duration, - current_volume = current_volume - )) - } - - # Start a new bucket + # spill over + proportion <- remaining_volume / bar_volume + volume_to_add <- remaining_volume + } + + bucketdata$agg.bvol[current_bucket] <- bucketdata$agg.bvol[current_bucket] + proportion * bar_bvol + bucketdata$agg.svol[current_bucket] <- bucketdata$agg.svol[current_bucket] + proportion * bar_svol + bucketdata$duration[current_bucket] <- bucketdata$duration[current_bucket] + proportion * bar_duration + bucketdata$current_volume[current_bucket] <- bucketdata$current_volume[current_bucket] + volume_to_add + + # Update remaining values + bar_volume <- bar_volume - volume_to_add + bar_bvol <- bar_bvol * (1 - proportion) + bar_svol <- bar_svol * (1 - proportion) + bar_duration <- bar_duration * (1 - proportion) + remaining_volume <- remaining_volume - volume_to_add + + # If the bucket is full, move to the next one + if (remaining_volume == 0 && current_bucket <= num_buckets) { current_bucket <- current_bucket + 1 - current_volume <- 0 - current_duration <- tb_duration * (1 - proportion_to_add) - remaining_volume <- remaining_volume - vol_to_add + remaining_volume <- vbs } } - } + } ############################################################################ # STEP 5 : COMPUTE IVPIN @@ -467,7 +447,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step6) + # ux$show(c= verbose, m = vpin_ms$step6) # Normalize the duration to be centered around 1 mean_duration <- mean(bucketdata$duration) @@ -510,7 +490,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, Vb <- bucketdata$agg.bvol Vs <- bucketdata$agg.svol - ivpin <- numeric(nrow(bucketdata)) # Ensure ivpin is initialized + ivpin <- numeric(nrow(bucketdata)) perform_grid_search <- function(i, j, informed_arrival, total_arrival_rate, t, Vb, Vs, grid_size) { best_params <- NULL @@ -523,24 +503,22 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, initial_guess <- c(alpha_init, delta_init, mu_init, eps_init) - # Define the lower and upper bounds for the optimization variables low <- c(0, 0, 0, 0) # Lower bounds up <- c(1, 1, Inf, Inf) # Upper bounds - # Perform the constrained optimization result <- tryCatch({ - nloptr::slsqp( + nloptr::neldermead( x0 = initial_guess, fn = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), - lb = low, - ub = up + lower = low, + upper = up ) }, error = function(e) { message("Error during optimization: ", conditionMessage(e)) return(NULL) }) - if (!inherits(result, "try-error") && result$value < best_log_likelihood) { + if (!is.null(result) && result$value < best_log_likelihood) { best_params <- result$par best_log_likelihood <- result$value exit_flag <- TRUE @@ -568,15 +546,15 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, exit_flag <- result[[3]] } else { initial_guess <- c(best_params[1], best_params[2], best_params[3], best_params[4]) - initial_guess <- c(0.5, 0.5, 50, 100) + tryCatch({ - result <- nloptr::slsqp( + result <- nloptr::neldermead( initial_guess, fn = function(params) compute_log_likelihood(params, t[j:i], Vb[j:i], Vs[j:i]), lower = c(0, 0, 0, 0), upper = c(1, 1, Inf, Inf) ) - if (!inherits(result, "try-error") && result$value < best_log_likelihood) { + if (!is.null(result) && result$value < best_log_likelihood) { best_params <- result$par exit_flag <- TRUE } else { @@ -598,14 +576,14 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, } } - estimatevpin@ivpin <- ivpin - estimatevpin@runningtime <- ux$timediff(time_on, time_off) + # estimatevpin@ivpin <- ivpin + # estimatevpin@runningtime <- ux$timediff(time_on, time_off) num_of_nan <- sum(is.na(ivpin)) if (num_of_nan > 0) { warning(sprintf("Optimization failed for %d buckets", num_of_nan)) } - ux$show(c= verbose, m = vpin_ms$complete) - + # ux$show(c= verbose, m = vpin_ms$complete) + return(ivpin) } From 96de8c61cd7dca5b0334754183ed8a909d1d2226 Mon Sep 17 00:00:00 2001 From: Alexandre Borentain Cristea <149905334+alecriste@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:09:33 -1000 Subject: [PATCH 18/19] Update model_ivpin.R --- R/model_ivpin.R | 78 ++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/R/model_ivpin.R b/R/model_ivpin.R index 65d735a..f85975d 100644 --- a/R/model_ivpin.R +++ b/R/model_ivpin.R @@ -150,29 +150,29 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, " data <- as.data.frame(data) - # vpin_err <- uierrors$vpin() - # vpin_ms <- uix$vpin(timebarsize = timebarsize) + vpin_err <- uierrors$vpin() + vpin_ms <- uix$vpin(timebarsize = timebarsize) # Check that all variables exist and do not refer to non-existent variables # -------------------------------------------------------------------------- - # allvars <- names(formals()) - # environment(.xcheck$existence) <- environment() - # .xcheck$existence(allvars, err = uierrors$vpin()$fn) + allvars <- names(formals()) + environment(.xcheck$existence) <- environment() + .xcheck$existence(allvars, err = uierrors$vpin()$fn) # Checking the the arguments of the function are valid # Check that all arguments are valid # ------------------------------------------------------------------------- largs <- list(data, timebarsize, buckets, samplength, tradinghours, slow, grid_size, verbose) names(largs) <- names(formals()) - # rst <- .xcheck$args(arglist = largs, fn = "vpin") - # ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) + rst <- .xcheck$args(arglist = largs, fn = "vpin") + ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) # Convert data into a dataframe, in case it is a matrix or an array data <- as.data.frame(data) # # initialize the local variables tbv <- vbs <- bucket <- NULL - # estimatevpin <- new("estimate.vpin") + estimatevpin <- new("estimate.vpin") time_on <- Sys.time() @@ -187,7 +187,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, # USER MESSAGE # ---------------------------------------------------------------------------- - # ux$show(c= verbose, m = ivpin_ms$step1) + ux$show(c= verbose, m = ivpin_ms$step1) # We want only three columns: timestamp, price and volume so we import into # the dataset only the three first columns @@ -261,37 +261,37 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, # ---------------------------------------------------------------------------- # - THIS PART IS JUST TO ESTIMATE RUNNING TIME - NO OUTPUT USED - # - - - # diffseconds <- function(time_on, time_off) { - # dsecs <- difftime(time_off, time_on, units = "secs") - # return(dsecs) - # } + diffseconds <- function(time_on, time_off) { + dsecs <- difftime(time_off, time_on, units = "secs") + return(dsecs) + } - # if (nrow(dataset) >= 50000) { + if (nrow(dataset) >= 50000) { - # temptime_on <- Sys.time() + temptime_on <- Sys.time() - # chunk <- 5000 + chunk <- 5000 - # tempbars <- aggregate(price ~ interval, data = dataset[1:chunk, ], - # FUN = function(x) dp <- tail(x, 1) - head(x, 1) - # ) - # tempbars <- merge(tempbars, - # aggregate(volume ~ interval, dataset[1:chunk, ], sum), - # by = "interval" - # ) - # tempbars$interval <- as.POSIXct(tempbars$interval, tz = "") + tempbars <- aggregate(price ~ interval, data = dataset[1:chunk, ], + FUN = function(x) dp <- tail(x, 1) - head(x, 1) + ) + tempbars <- merge(tempbars, + aggregate(volume ~ interval, dataset[1:chunk, ], sum), + by = "interval" + ) + tempbars$interval <- as.POSIXct(tempbars$interval, tz = "") - # temptime_off <- Sys.time() + temptime_off <- Sys.time() - # exptime <- ux$timediff(temptime_on, temptime_off, - # 5*log2(nrow(dataset) / (chunk))) + exptime <- ux$timediff(temptime_on, temptime_off, + 5*log2(nrow(dataset) / (chunk))) - # ux$show(c= verbose, m = paste("[~", ceiling(exptime), "seconds]")) + ux$show(c= verbose, m = paste("[~", ceiling(exptime), "seconds]")) - # } else { + } else { - # # ux$show(c= verbose, m = "") - # } + ux$show(c= verbose, m = "") + } # - - # ---------------------------------------------------------------------------- @@ -315,7 +315,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, # USER MESSAGE # -------------------------------------------------------------------------- - # ux$show(c= verbose, m = ivpin_ms$step3) + ux$show(c= verbose, m = ivpin_ms$step3) # -------------------------------------------------------------------------- # II.1 CALCULATE NUMBER OF DAYS (NDAYS), TOTAL VOLUME (TOTVOL) AND THEN VBS @@ -335,7 +335,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, names(params) <- c("tbSize", "buckets", "samplength", "VBS", "ndays") - # estimatevpin@parameters <- params + estimatevpin@parameters <- params # -------------------------------------------------------------------------- # II.2 CALCULATE THE STANDARD DEVIATION OF DP (SDP) @@ -352,7 +352,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, # USER MESSAGE # -------------------------------------------------------------------------- - # ux$show(c= verbose, m = vpin_ms$step4) + ux$show(c= verbose, m = vpin_ms$step4) # -------------------------------------------------------------------------- # VI.1 ASSIGN N(0, 1) PROB. TO EACH TIMEBAR AND CALCULATE BUY/SELL VOLUMES @@ -385,7 +385,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, # USER MESSAGE # -------------------------------------------------------------------------- - # ux$show(c= verbose, m = vpin_ms$step5) + ux$show(c= verbose, m = vpin_ms$step5) # Initialize bucketdata @@ -447,7 +447,7 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, # USER MESSAGE # -------------------------------------------------------------------------- - # ux$show(c= verbose, m = vpin_ms$step6) + ux$show(c= verbose, m = vpin_ms$step6) # Normalize the duration to be centered around 1 mean_duration <- mean(bucketdata$duration) @@ -576,13 +576,13 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 100, } } - # estimatevpin@ivpin <- ivpin - # estimatevpin@runningtime <- ux$timediff(time_on, time_off) + estimatevpin@ivpin <- ivpin + estimatevpin@runningtime <- ux$timediff(time_on, time_off) num_of_nan <- sum(is.na(ivpin)) if (num_of_nan > 0) { warning(sprintf("Optimization failed for %d buckets", num_of_nan)) } - # ux$show(c= verbose, m = vpin_ms$complete) + ux$show(c= verbose, m = vpin_ms$complete) return(ivpin) From 514d745ae1203f1e75f230f79ee973b648c687c1 Mon Sep 17 00:00:00 2001 From: Alexandre Pierre-Henri Borentain <149905334+alecriste@users.noreply.github.com> Date: Thu, 12 Dec 2024 22:01:13 -0800 Subject: [PATCH 19/19] Update model_vpin.R --- R/model_vpin.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/model_vpin.R b/R/model_vpin.R index 325717e..15291bc 100644 --- a/R/model_vpin.R +++ b/R/model_vpin.R @@ -1447,7 +1447,9 @@ ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, optimal <- as.list(findMLE(previously_optimal, mldata)) names(optimal) <- c("alpha", "delta", "mu", "eb", "es", "likelihood", "conv") - if(optimal$conv >= 0){ + if(optimal$conv >= 0 & + optimal$alpha > 0 && optimal$alpha < 1 && + optimal$delta > 0 && optimal$delta < 1){ bucketdata$ivpin[k] <- with(optimal, (alpha * mu)/(eb + es + mu)) previously_optimal <- unlist(optimal[1:5]) # Update the progressbar