content
| - %META:TOPICPARENT{name="VirtTipsAndTricksGuide"}%
---+Guide: Splitting the 3.4 Billion Triple UniProt Dataset File
---++What?
How to split up UniProt's 3.4 billion triple dataset file en route to bulk loading it into a Virtuoso instance.
---++Why?
Attempting to load a single file of this magnitude is an inefficient and problem-prone undertaking for any RDF store. Hence the need to break up the file prior to bulk loading.
---++How?
The following script splits the 3.4 billion triple UniProt dataset file into smaller files.
The last line of the script tells which file should be split and where the result should go.
The splitter occupies only one CPU core due to its linear nature, so many files could be split in parallel on any multi-core box.
-- Builds the splitter state vector and stores it in app_env.
--   out_fname_tmpl : sprintf template for output file names; it is later formatted
--                    with the index of the current file (slot [2]).
--   cut_size       : number of triples after which the current output file is closed.
--   app_env        : receives the new state; the slot numbers below are the positions
--                    that the other RDFXML_FILE_SPLIT_* procedures read and write.
create procedure DB.DBA.RDFXML_FILE_SPLIT_INIT (in out_fname_tmpl varchar, in cut_size integer, inout app_env any)
{
app_env := vector (
out_fname_tmpl, -- [0] - template for output file names
string_output (), -- [1] - output session that collects the text of the current file
0, -- [2] - index of the current output file
iri_id_num (#ib1), -- [3] - numeric ID of the next blank node to allocate
dict_new (1000000), -- [4] - dictionary of bnodes
make_array (50000, 'any'), -- [5] - accumulator of triples (capacity must match the 50000 check in the triple callbacks)
0, -- [6] - number of triples currently in the accumulator
vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0), -- [7] - env of http_ttl_xxx()
0, -- [8] - count of triples written to the current file
cut_size ); -- [9] - triple count at which a single output file is cut
}
;
-- Serializes every triple waiting in the accumulator into the current output
-- session and, when required, writes that session to disk and starts a new file.
--   app_env           : splitter state vector built by DB.DBA.RDFXML_FILE_SPLIT_INIT.
--   can_continue_file : non-zero if the current file may keep growing; zero forces
--                       the session to be written out regardless of its size.
create procedure DB.DBA.RDFXML_FILE_SPLIT_FLUSH (inout app_env any, in can_continue_file integer)
{
  declare pos, n_pending, n_in_file, file_cap integer;
  declare acc, ttl_env, out_ses any;
  dbg_obj_princ ('DB.DBA.RDFXML_FILE_SPLIT_FLUSH (..., can_continue_file=', can_continue_file, '): file=', app_env[2], ' tcount=', app_env[6], ' total_count=', app_env[8]);

  -- Scalar counters are read directly from their slots.
  n_pending := app_env[6];
  n_in_file := app_env[8];
  file_cap := app_env[9];

  -- The session, the accumulator and the TTL env are detached from app_env here
  -- and stored back with aset_zap_arg at the end of the procedure.
  out_ses := aref_set_0 (app_env, 1);
  acc := aref_set_0 (app_env, 5);
  ttl_env := aref_set_0 (app_env, 7);

  -- First pass: prefix output for every pending triple.
  pos := 0;
  while (pos < n_pending)
    {
      http_ttl_prefixes (ttl_env, acc[pos][0], acc[pos][1], acc[pos][2], out_ses);
      pos := pos + 1;
    }

  -- Second pass: the triples themselves.
  pos := 0;
  while (pos < n_pending)
    {
      http_ttl_triple (ttl_env, acc[pos][0], acc[pos][1], acc[pos][2], out_ses);
      pos := pos + 1;
    }

  -- The accumulator is now logically empty.
  app_env[6] := 0;
  n_in_file := n_in_file + n_pending;

  if ((not can_continue_file) or (n_in_file >= file_cap))
    {
      -- Write the finished file, then start over with a fresh session,
      -- a fresh TTL env and the next file index.
      string_to_file (sprintf (app_env[0], app_env[2]), out_ses, -2);
      out_ses := string_output ();
      ttl_env := vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0);
      app_env[2] := app_env[2] + 1;
      n_in_file := 0;
    }

  -- Put the detached members back into the state vector.
  aset_zap_arg (app_env, 1, out_ses);
  aset_zap_arg (app_env, 5, acc);
  aset_zap_arg (app_env, 7, ttl_env);
  app_env[8] := n_in_file;
}
;
-- Parser callback: hands out the next blank node.
-- Returns in res the IRI_ID built from the counter kept in app_env[3],
-- then advances that counter by one. g_iid is not used.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK (inout g_iid IRI_ID, inout app_env any, inout res IRI_ID)
{
  res := iri_id_from_num (app_env[3]);
  app_env[3] := app_env[3] + 1;
}
;
-- Parser callback that maps a URI to the value used for it in triples.
-- The splitter does no lookup at all: the URI string itself is returned in res,
-- so g_iid and app_env are not touched.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID (inout uri varchar, inout g_iid IRI_ID, inout app_env any, inout res IRI_ID) {
res := uri;
}
;
-- Parser callback for a triple whose object is a URI.
-- Appends (s_uri, p_uri, o_uri) to the accumulator in app_env[5], flushing the
-- accumulator first when it already holds 50000 triples. g_iid is not used.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE (
  inout g_iid IRI_ID,
  inout s_uri varchar,
  inout p_uri varchar,
  inout o_uri varchar,
  inout app_env any )
{
  declare slot integer;

  -- 50000 is the accumulator capacity allocated by DB.DBA.RDFXML_FILE_SPLIT_INIT.
  if (app_env[6] >= 50000)
    {
      DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
    }

  -- Read the fill count only after the possible flush, which resets it.
  slot := app_env[6];

  -- Set box flag 1 on the object before storing it
  -- (presumably marks the string as an IRI rather than a literal - TODO confirm).
  __box_flags_set (o_uri, 1);
  app_env[5][slot] := vector (s_uri, p_uri, o_uri);
  app_env[6] := slot + 1;
}
;
-- Parser callback for a triple whose object is a literal.
-- The literal is built from its value, datatype and language by
-- DB.DBA.RDF_MAKE_LONG_OF_TYPEDSQLVAL_STRINGS and appended, together with the
-- subject and predicate, to the accumulator in app_env[5]. The accumulator is
-- flushed first when it already holds 50000 triples. g_iid is not used.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L (
  inout g_iid IRI_ID,
  inout s_uri varchar,
  inout p_uri varchar,
  inout o_val any,
  inout o_type varchar,
  inout o_lang varchar,
  inout app_env any )
{
  declare slot integer;

  -- 50000 is the accumulator capacity allocated by DB.DBA.RDFXML_FILE_SPLIT_INIT.
  if (app_env[6] >= 50000)
    {
      DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
    }

  -- Read the fill count only after the possible flush, which resets it.
  slot := app_env[6];
  app_env[5][slot] := vector (s_uri, p_uri, DB.DBA.RDF_MAKE_LONG_OF_TYPEDSQLVAL_STRINGS (o_val, o_type, o_lang));
  app_env[6] := slot + 1;
}
;
-- Entry point: parses one RDF/XML file and re-writes its triples into a series
-- of smaller files.
--   in_fname       : path of the input file; names ending in '.rdf.gz' or '.xml.gz'
--                    are opened with gz_file_open, everything else with file_open.
--   base           : base URI passed to the RDF/XML parser.
--   parse_mode     : parser mode flags, passed to rdf_load_rdfxml unchanged.
--   out_fname_tmpl : sprintf template for output file names; it is formatted with
--                    the zero-based index of the output file.
--   cut_size       : triple count at which an output file is closed (default 100000000).
--                    The count is only checked when the 50000-triple accumulator is
--                    flushed, so a file may exceed cut_size by up to one accumulator.
create procedure DB.DBA.RDFXML_FILE_SPLIT (in in_fname varchar, in base varchar, in parse_mode integer, in out_fname_tmpl varchar, in cut_size integer := 100000000)
{
  declare in_ses, app_env any;

  if (in_fname like '%.rdf.gz' or in_fname like '%.xml.gz')
    in_ses := gz_file_open (in_fname);
  else
    in_ses := file_open (in_fname);

  DB.DBA.RDFXML_FILE_SPLIT_INIT (out_fname_tmpl, cut_size, app_env);

  -- The parser stores nothing itself: every event goes to the callbacks named
  -- below, which collect triples in app_env.
  rdf_load_rdfxml (in_ses, parse_mode,
    '' /* fake graph; the callbacks below never read their g_iid argument */,
    vector (
      '',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L',
      '',
      '' ),
    app_env,
    base );

  -- Final flush with can_continue_file = 0 forces the last, partially filled file
  -- to be written. The call is fully qualified, like every other reference in this
  -- script, so it does not depend on the caller's current qualifier.
  DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 0);
}
;
-- Example run: split uniparc.rdf, using http://purl.uniprot.org/uniparc/ as the
-- base URI and parse mode 0. Output goes to uniparc000000.ttl, uniparc000001.ttl, ...
-- (the %06d in the template is replaced by the file index), with a cut size of
-- 200000 triples per file. Edit the paths and the cut size to suit your data.
DB.DBA.RDFXML_FILE_SPLIT ('/demos/uniprot/src/uniparc.rdf', 'http://purl.uniprot.org/uniparc/', 0, '/demos/uniprot/src/uniparc%06d.ttl', 200000);
---++Related
* [[VirtTipsAndTricksGuide][Virtuoso Tips and Tricks Collection]]
* [[http://docs.openlinksw.com/virtuoso/rdfsparql.html][Virtuoso Documentation]]
|