Not logged in : Login

About: VirtTipsAndTricksGuideSplittingUnProt     Goto   Sponge   NotDistinct   Permalink

An Entity of Type : atom:Entry, within Data Space : ods.openlinksw.com associated with source document(s)

AttributesValues
type
Date Created
Date Modified
label
  • VirtTipsAndTricksGuideSplittingUnProt
maker
Title
  • VirtTipsAndTricksGuideSplittingUnProt
isDescribedUsing
has creator
attachment
content
  • %META:TOPICPARENT{name="VirtTipsAndTricksGuide"}%
---+Splitting 3.4 billion triple UniProt datasets file Guide

---++What?
How to split up UniProt's 3.4 billion triples dataset en route to bulk loading into a Virtuoso instance.

---++Why?
Attempting to load a single file of this magnitude is an inefficient and problem-prone undertaking for any RDF store. Hence the need to break up the file prior to bulk loading.

---++How?
The following script splits the 3.4 billion triple UniProt datasets file into smaller files. The last line of the script tells which file should be split and where the result should go. The splitter occupies only one CPU core due to its linear nature, so many files could be split in parallel on any multi-core box.

-- Builds the splitter state vector that is threaded through every parser callback.
--   out_fname_tmpl : sprintf() template for output file names; receives the file index
--   cut_size       : number of triples after which the current output file is closed
--   app_env        : (out) receives the freshly built state vector
create procedure DB.DBA.RDFXML_FILE_SPLIT_INIT (in out_fname_tmpl varchar, in cut_size integer, inout app_env any)
{
  app_env := vector (
    out_fname_tmpl,                                             -- [0] - template for out names
    string_output (),                                           -- [1] - out session
    0,                                                          -- [2] - index of current file
    iri_id_num (#ib1),                                          -- [3] - ID of next bnode to allocate
    dict_new (1000000),                                         -- [4] - dictionary of bnodes
    make_array (50000, 'any'),                                  -- [5] - accumulator of triples
    0,                                                          -- [6] - number of triples in the accumulator
    vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0),       -- [7] - env of http_ttl_xxx()
    0,                                                          -- [8] - count of triples written to the current file
    cut_size );                                                 -- [9] - size of single cut file
}
;

-- Serializes the accumulated triples as Turtle into the output session and, when the
-- current file is complete, writes the session to disk and starts the next file.
--   app_env           : splitter state vector built by DB.DBA.RDFXML_FILE_SPLIT_INIT
--   can_continue_file : non-zero = keep appending to the current file unless cut_size is reached;
--                       zero     = close the current file now (used for the final flush)
create procedure DB.DBA.RDFXML_FILE_SPLIT_FLUSH (inout app_env any, in can_continue_file integer)
{
  declare tctr, tcount, total_tcount, cut_size integer;
  declare triples, env, ses any;
  dbg_obj_princ ('DB.DBA.RDFXML_FILE_SPLIT_FLUSH (..., can_continue_file=', can_continue_file, '): file=', app_env[2], ' tcount=', app_env[6], ' total_count=', app_env[8]);
  -- The big members are taken out of app_env here and put back at the end of the procedure;
  -- presumably aref_set_0 / aset_zap_arg are used to avoid copying them -- confirm against the Virtuoso docs.
  ses := aref_set_0 (app_env, 1);
  triples := aref_set_0 (app_env, 5);
  tcount := app_env[6];
  env := aref_set_0 (app_env, 7);
  total_tcount := app_env[8];
  cut_size := app_env[9];
  -- First pass emits prefix declarations for the whole batch, second pass emits the triples.
  for (tctr := 0; tctr < tcount; tctr := tctr + 1)
    {
      http_ttl_prefixes (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);
    }
  for (tctr := 0; tctr < tcount; tctr := tctr + 1)
    {
      http_ttl_triple (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);
    }
  app_env[6] := 0;
  total_tcount := total_tcount + tcount;
  -- total_tcount = 0 means nothing was serialized since the last cut, so the session is empty;
  -- skipping the write in that case avoids leaving a zero-byte trailing file.
  if ((total_tcount > 0) and ((not can_continue_file) or (total_tcount >= cut_size)))
    {
      -- Mode -2 is expected to overwrite/truncate an existing file -- see string_to_file() docs.
      string_to_file (sprintf (app_env[0], app_env[2]), ses, -2);
      -- Start the next file with a fresh session and a fresh Turtle serializer environment.
      ses := string_output ();
      env := vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0);
      app_env[2] := app_env[2] + 1;
      total_tcount := 0;
    }
  aset_zap_arg (app_env, 1, ses);
  aset_zap_arg (app_env, 5, triples);
  aset_zap_arg (app_env, 7, env);
  app_env[8] := total_tcount;
}
;

-- Parser callback: allocates the next blank node ID from the counter kept in app_env[3].
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK (inout g_iid IRI_ID, inout app_env any, inout res IRI_ID)
{
  declare i integer;
  i := app_env[3];
  res := iri_id_from_num (i);
  app_env[3] := i+1;
}
;

-- Parser callback: returns the URI string itself instead of resolving it to an IRI ID.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID (inout uri varchar, inout g_iid IRI_ID, inout app_env any, inout res IRI_ID)
{
  res := uri;
}
;

-- Parser callback for a triple whose object is a resource: buffers the triple in the
-- accumulator, flushing first when the accumulator (50000 slots, see INIT) is full.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE (
  inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar,
  inout o_uri varchar,
  inout app_env any )
{
  if (app_env[6] >= 50000)
    DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
  -- NOTE(review): box flag 1 presumably marks the string as an IRI rather than a literal -- confirm.
  __box_flags_set (o_uri, 1);
  app_env[5][app_env[6]] := vector (s_uri, p_uri, o_uri);
  app_env[6] := app_env[6]+1;
}
;

-- Parser callback for a triple whose object is a literal: converts value/type/language
-- into a single object value and buffers the triple, flushing first when the accumulator is full.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L (
  inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar,
  inout o_val any, inout o_type varchar, inout o_lang varchar,
  inout app_env any )
{
  if (app_env[6] >= 50000)
    DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
  app_env[5][app_env[6]] := vector (s_uri, p_uri,
    DB.DBA.RDF_MAKE_LONG_OF_TYPEDSQLVAL_STRINGS (o_val, o_type, o_lang));
  app_env[6] := app_env[6]+1;
}
;

-- Entry point: parses the RDF/XML file in_fname and writes its triples as a series of
-- Turtle files named by out_fname_tmpl.
--   in_fname       : input file; names ending in .rdf.gz or .xml.gz are opened as gzip
--   base           : base URI passed to the RDF/XML parser
--   parse_mode     : parse mode passed through to rdf_load_rdfxml()
--   out_fname_tmpl : sprintf() template for output names, e.g. '/path/part%06d.ttl'
--   cut_size       : triples per output file (checked once per 50000-triple batch)
create procedure DB.DBA.RDFXML_FILE_SPLIT (in in_fname varchar, in base varchar, in parse_mode integer, in out_fname_tmpl varchar, in cut_size integer := 100000000)
{
  declare in_ses, app_env any;
  if (in_fname like '%.rdf.gz' or in_fname like '%.xml.gz')
    in_ses := gz_file_open (in_fname);
  else
    in_ses := file_open (in_fname);
  DB.DBA.RDFXML_FILE_SPLIT_INIT (out_fname_tmpl, cut_size, app_env);
  rdf_load_rdfxml (in_ses, parse_mode,
    '' /* fake graph, UNAME is to avoid copying */,
    vector (
      '',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L',
      '',
      '' ),
    app_env,
    base );
  -- Final flush: write out whatever is still buffered and close the last file.
  -- Fully qualified so the call does not depend on the caller's current qualifier/owner.
  DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 0);
}
;

DB.DBA.RDFXML_FILE_SPLIT ('/demos/uniprot/src/uniparc.rdf', 'http://purl.uniprot.org/uniparc/', 0, '/demos/uniprot/src/uniparc%06d.ttl', 200000);

---++Related
   * [[VirtTipsAndTricksGuide][Virtuoso Tips and Tricks Collection]]
   * [[http://docs.openlinksw.com/virtuoso/rdfsparql.html][Virtuoso Documentation]]
id
  • 7505540fd1a7bd2b04553f8b2df53466
link
has container
http://rdfs.org/si...ices#has_services
atom:title
  • VirtTipsAndTricksGuideSplittingUnProt
links to
atom:source
atom:author
atom:published
  • 2017-06-13T05:39:41Z
atom:updated
  • 2017-06-13T05:39:41Z
topic
is made of
is container of of
is link of
is http://rdfs.org/si...vices#services_of of
is links to of
is creator of of
is atom:entry of
is atom:contains of
Faceted Search & Find service v1.17_git132 as of May 12 2023


Alternative Linked Data Documents: iSPARQL | ODE     Content Formats:   [cxml] [csv]     RDF   [text] [turtle] [ld+json] [rdf+json] [rdf+xml]     ODATA   [atom+xml] [odata+json]     Microdata   [microdata+json] [html]    About   
This material is Open Knowledge   W3C Semantic Web Technology [RDF Data] Valid XHTML + RDFa
OpenLink Virtuoso version 07.20.3238 as of May 23 2023, on Linux (x86_64-generic-linux-glibc25), Single-Server Edition (15 GB total memory, 3 GB memory in use)
Data on this page belongs to its respective rights holders.
Virtuoso Faceted Browser Copyright © 2009-2024 OpenLink Software