mirror of https://github.com/Rucknium/misc-research.git (synced 2025-01-31 06:15:51 +00:00)
Delete duplicates afterward instead of continuously for speedup
This commit is contained in:
parent 6445948fe8
commit f613b58ea7

1 changed file with 28 additions and 9 deletions
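For context, the approach this commit moves away from rejects duplicates continuously, at insert time, via a uniqueness constraint. Below is a minimal standalone sketch of that general pattern using a plain UNIQUE column and INSERT OR IGNORE on an in-memory database; the table name is hypothetical, and this is not the gist helper (mysqliteWriteTable) that the old code used.

# Illustration only, not code from this repository: continuous duplicate
# rejection with a UNIQUE constraint and "INSERT OR IGNORE".
library(DBI)
con.demo <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbExecute(con.demo, "CREATE TABLE nodes_demo (node TEXT UNIQUE)")
DBI::dbExecute(con.demo,
  "INSERT OR IGNORE INTO nodes_demo (node) VALUES ('a'), ('b'), ('a')")
DBI::dbGetQuery(con.demo, "SELECT COUNT(*) AS n_unique FROM nodes_demo")
# n_unique is 2: the duplicate 'a' was skipped at insert time.
DBI::dbDisconnect(con.demo)

Checking uniqueness on every insert is the per-row overhead that this commit removes by deduplicating once at the end.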
@@ -9,16 +9,19 @@ library(DBI)
 
 data.dir <- ""
 # Input data directory here, with trailing "/"
 
-source("https://gist.githubusercontent.com/jeffwong/5925000/raw/bf02ed0dd2963169a91664be02fb18e45c4d1e20/sqlitewritetable.R")
-# From https://gist.github.com/jeffwong/5925000
-# Modifies RSQLite's sqliteWriteTable function so as to reject duplicates
-
 con <- DBI::dbConnect(RSQLite::SQLite(), paste0(data.dir, "tx-graph-node-indices.db"))
 
+DBI::dbExecute(con, "PRAGMA max_page_count = 4294967292;")
+# Allows SQL database files up to 4 TB. See:
+# https://stackoverflow.com/questions/16685016/sqlite3-operationalerror-database-or-disk-is-full-on-lustre
+
+# DBI::dbExecute(con, "PRAGMA temp_store = 2;")
+# Put temp file in RAM:
+# https://stackoverflow.com/a/19259699
+
 DBI::dbExecute(con, "CREATE TABLE nodes (
 node TEXT,
-node_index INTEGER PRIMARY KEY AUTOINCREMENT,
-unique(node)
+node_index INTEGER PRIMARY KEY
 )")
 
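The size cap set by the new PRAGMA works out to max_page_count * page_size bytes; the script's "up to 4 TB" figure matches 4294967292 pages at a 1024-byte SQLite page size (about 4.4e12 bytes), and would be larger with a 4096-byte page size. A hypothetical check, assuming the con handle opened in the script:

# Not part of the commit: report the database size cap implied by the
# current page size and the max_page_count set above.
page.size      <- DBI::dbGetQuery(con, "PRAGMA page_size;")[1, 1]
max.page.count <- DBI::dbGetQuery(con, "PRAGMA max_page_count;")[1, 1]
cat("Maximum database size (GB):", page.size * max.page.count / 1e9, "\n")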
@@ -60,15 +63,31 @@ for (file.iter in tx.graph.files) {
 
 new.nodes <- unique(c(tx.graph.chunk$origin, tx.graph.chunk$destination))
 
-nodes.to.insert <- data.frame(node = new.nodes, stringsAsFactors = FALSE)
+nodes.to.insert <- data.frame(node = new.nodes, node_index = NA, stringsAsFactors = FALSE)
 
-mysqliteWriteTable(con, "nodes",
-  nodes.to.insert, append = TRUE, row.names = FALSE, ignore = TRUE)
+DBI::dbWriteTable(con, "nodes",
+  nodes.to.insert, append = TRUE)
 
 cat(nrow(nodes.to.insert), "Nodes written\n")
 
 
 }
 
+base::date()
+
+DBI::dbExecute(con, "DELETE FROM nodes
+WHERE rowid NOT IN (
+SELECT MIN(rowid)
+FROM nodes
+GROUP BY node
+)")
+
+base::date()
+# 1 hour
+
+# Suggested by https://stackoverflow.com/questions/25884095/how-can-i-delete-duplicates-in-sqlite
+# Must do this at the end to make sure did not leave any out
+# from the tail end of the last 1000 iters
+
 DBI::dbWriteTable(con, "edgelist_intermediate_1",
 data.frame(origin = character(0), destination = character(0),
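Taken together, the new strategy is: append every batch with no uniqueness check, then delete duplicate node rows once at the end, keeping the lowest rowid. A self-contained sketch of the same pattern on a toy in-memory database (illustrative batches; the real script loops over chunks of tx.graph.files):

# Sketch of the dedupe-afterward pattern from this commit, on toy data.
library(DBI)
con.demo <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbExecute(con.demo, "CREATE TABLE nodes (
node TEXT,
node_index INTEGER PRIMARY KEY
)")

batches <- list(c("a", "b"), c("b", "c"), c("c", "d"))
for (batch in batches) {
  # node_index = NA: an INTEGER PRIMARY KEY column aliases SQLite's rowid,
  # so NULL/NA values are auto-assigned increasing integer indices.
  DBI::dbWriteTable(con.demo, "nodes",
    data.frame(node = batch, node_index = NA, stringsAsFactors = FALSE),
    append = TRUE)
}

# Single deduplication pass at the end, as in the commit:
DBI::dbExecute(con.demo, "DELETE FROM nodes
WHERE rowid NOT IN (
SELECT MIN(rowid)
FROM nodes
GROUP BY node
)")

DBI::dbGetQuery(con.demo, "SELECT * FROM nodes ORDER BY node_index")
DBI::dbDisconnect(con.demo)

Because node_index INTEGER PRIMARY KEY aliases the rowid, MIN(rowid) in the DELETE keeps the first index assigned to each node.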