mirror of
https://github.com/Rucknium/misc-research.git
synced 2024-12-22 11:29:22 +00:00
Piecewise processing for BTC to avoid integer overflow
This commit is contained in:
parent
c0af6a5862
commit
6445948fe8
1 changed files with 87 additions and 17 deletions
|
@ -5,30 +5,99 @@ library(DBI)
|
|||
# NOTE: The lubridate package must also be installed, but it is not loaded here due to
|
||||
# it masking functions
|
||||
|
||||
is.btc <- FALSE
|
||||
# Change to TRUE if processing BTC
|
||||
|
||||
data.dir <- ""
|
||||
# Input data directory here, with trailing "/"
|
||||
|
||||
|
||||
con <- DBI::dbConnect(RSQLite::SQLite(), paste0(data.dir, "tx-graph-node-indices.db"))
|
||||
|
||||
master.edgelist <- DBI::dbGetQuery(con,
|
||||
"SELECT origin_index, destination_index,block_height,value FROM edgelist_intermediate_2")
|
||||
|
||||
master.edgelist.output.created <- master.edgelist[
|
||||
(! is.na(master.edgelist$value)) & master.edgelist$value > 0 , c("destination_index", "block_height")]
|
||||
colnames(master.edgelist.output.created) <- c("output_index", "output.created.block_height")
|
||||
setDT(master.edgelist.output.created)
|
||||
|
||||
master.edgelist.output.spent <- master.edgelist[, c("origin_index", "block_height")]
|
||||
colnames(master.edgelist.output.spent) <- c("output_index", "output.spent.block_height")
|
||||
setDT(master.edgelist.output.spent)
|
||||
|
||||
# Above, only rows with a positive value are kept as created outputs, since that is the value of the created output.
|
||||
# Then, below, an "inner merge" keeps only the matching outputs on the spending side.
|
||||
|
||||
rm(master.edgelist)
|
||||
master.edgelist.output.spent <- merge(master.edgelist.output.created, master.edgelist.output.spent)
|
||||
rm(master.edgelist.output.created)
|
||||
if ( ! is.btc) {
|
||||
|
||||
master.edgelist.output.created <- master.edgelist[
|
||||
(! is.na(master.edgelist$value)) & master.edgelist$value > 0 , c("destination_index", "block_height")]
|
||||
colnames(master.edgelist.output.created) <- c("output_index", "output.created.block_height")
|
||||
setDT(master.edgelist.output.created)
|
||||
|
||||
master.edgelist.output.spent <- master.edgelist[, c("origin_index", "block_height")]
|
||||
colnames(master.edgelist.output.spent) <- c("output_index", "output.spent.block_height")
|
||||
setDT(master.edgelist.output.spent)
|
||||
|
||||
# Above, only rows with a positive value are kept as created outputs, since that is the value of the created output.
|
||||
# Then, below, an "inner merge" keeps only the matching outputs on the spending side.
|
||||
|
||||
rm(master.edgelist)
|
||||
master.edgelist.output.spent <- merge(master.edgelist.output.created, master.edgelist.output.spent)
|
||||
rm(master.edgelist.output.created)
|
||||
|
||||
} else {
|
||||
|
||||
# For BTC, split the data requests and processing into block-height ranges to avoid an integer overflow issue in this step.
|
||||
# data.table issue:
|
||||
# https://github.com/Rdatatable/data.table/issues/3957
|
||||
# SQLite issue: "negative length vectors are not allowed"
|
||||
|
||||
master.edgelist.lt.500k <- DBI::dbGetQuery(con,
|
||||
"SELECT origin_index, destination_index,block_height,value FROM edgelist_intermediate_2 WHERE block_height < 500000")
|
||||
|
||||
master.edgelist.output.created.lt.500k <- master.edgelist.lt.500k[
|
||||
(! is.na(master.edgelist.lt.500k$value)) & master.edgelist.lt.500k$value > 0 , c("destination_index", "block_height")]
|
||||
colnames(master.edgelist.output.created.lt.500k) <- c("output_index", "output.created.block_height")
|
||||
setDT(master.edgelist.output.created.lt.500k)
|
||||
|
||||
master.edgelist.output.spent.lt.500k <- master.edgelist.lt.500k[, c("origin_index", "block_height")]
|
||||
colnames(master.edgelist.output.spent.lt.500k) <- c("output_index", "output.spent.block_height")
|
||||
setDT(master.edgelist.output.spent.lt.500k)
|
||||
|
||||
rm(master.edgelist.lt.500k)
|
||||
|
||||
|
||||
master.edgelist.gt.500k.lt.700k <- DBI::dbGetQuery(con,
|
||||
"SELECT origin_index, destination_index,block_height,value FROM edgelist_intermediate_2 WHERE block_height > 499999 AND block_height < 700000")
|
||||
|
||||
master.edgelist.output.created.gt.500k.lt.700k <- master.edgelist.gt.500k.lt.700k[
|
||||
(! is.na(master.edgelist.gt.500k.lt.700k$value) & master.edgelist.gt.500k.lt.700k$value > 0 ) , c("destination_index", "block_height")]
|
||||
colnames(master.edgelist.output.created.gt.500k.lt.700k) <- c("output_index", "output.created.block_height")
|
||||
setDT(master.edgelist.output.created.gt.500k.lt.700k)
|
||||
|
||||
master.edgelist.output.spent.gt.500k.lt.700k <- master.edgelist.gt.500k.lt.700k[, c("origin_index", "block_height")]
|
||||
colnames(master.edgelist.output.spent.gt.500k.lt.700k) <- c("output_index", "output.spent.block_height")
|
||||
setDT(master.edgelist.output.spent.gt.500k.lt.700k)
|
||||
|
||||
rm(master.edgelist.gt.500k.lt.700k)
|
||||
|
||||
|
||||
master.edgelist.gt.700k <- DBI::dbGetQuery(con,
|
||||
"SELECT origin_index, destination_index,block_height,value FROM edgelist_intermediate_2 WHERE block_height > 699999")
|
||||
|
||||
master.edgelist.output.created.gt.700k <- master.edgelist.gt.700k[
|
||||
(! is.na(master.edgelist.gt.700k$value) & master.edgelist.gt.700k$value > 0 ), c("destination_index", "block_height")]
|
||||
colnames(master.edgelist.output.created.gt.700k) <- c("output_index", "output.created.block_height")
|
||||
setDT(master.edgelist.output.created.gt.700k)
|
||||
|
||||
master.edgelist.output.spent.gt.700k <- master.edgelist.gt.700k[, c("origin_index", "block_height")]
|
||||
colnames(master.edgelist.output.spent.gt.700k) <- c("output_index", "output.spent.block_height")
|
||||
setDT(master.edgelist.output.spent.gt.700k)
|
||||
|
||||
rm(master.edgelist.gt.700k)
|
||||
|
||||
|
||||
master.edgelist.output.created <- rbindlist(list(master.edgelist.output.created.lt.500k,
|
||||
master.edgelist.output.created.gt.500k.lt.700k, master.edgelist.output.created.gt.700k))
|
||||
|
||||
master.edgelist.output.spent.lt.500k <- merge(master.edgelist.output.created, master.edgelist.output.spent.lt.500k)
|
||||
master.edgelist.output.spent.gt.500k.lt.700k <- merge(master.edgelist.output.created, master.edgelist.output.spent.gt.500k.lt.700k)
|
||||
master.edgelist.output.spent.gt.700k <- merge(master.edgelist.output.created, master.edgelist.output.spent.gt.700k)
|
||||
|
||||
master.edgelist.output.spent <- rbindlist(list(master.edgelist.output.spent.lt.500k,
|
||||
master.edgelist.output.spent.gt.500k.lt.700k, master.edgelist.output.spent.gt.700k))
|
||||
|
||||
rm(master.edgelist.output.created)
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -69,5 +138,6 @@ master.edgelist.output.spent <- merge(master.edgelist.output.spent,
|
|||
|
||||
saveRDS(master.edgelist.output.spent, paste0(data.dir, "master_edgelist_output_spent.rds"))
|
||||
|
||||
saveRDS(master.edgelist.output.spent, paste0(data.dir, "master_edgelist_output_spent-uncompressed.rds"), compress = FALSE)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue