. "%META:TOPICPARENT{name=\"VirtTipsAndTricksGuide\"}%\n\n\n---+Guide to Splitting the 3.4 Billion Triple UniProt Dataset File\n\n---++What?\nHow to split up UniProt's 3.4 billion triple dataset prior to bulk loading it into a Virtuoso instance. \n\n---++Why?\n\nAttempting to load a single file of this magnitude is an inefficient and problem-prone undertaking for any RDF store, hence the need to break up the file prior to bulk loading.\n\n---++How?\n\nThe following script splits the 3.4 billion triple UniProt dataset file into smaller files.\n\nThe last line of the script specifies which file should be split and where the resulting files should go. \n\nThe splitter occupies only one CPU core due to its linear nature, so many files can be split in parallel on any multi-core box.\n\n\ncreate procedure DB.DBA.RDFXML_FILE_SPLIT_INIT (in out_fname_tmpl varchar, in cut_size integer, inout app_env any)\n{\n app_env := vector (\n out_fname_tmpl,\t\t\t-- [0] - template for out names\n string_output (),\t\t\t-- [1] - out session\n 0,\t\t\t\t\t-- [2] - index of current file\n iri_id_num (#ib1),\t\t\t-- [3] - ID of next bnode to allocate\n dict_new (1000000),\t\t\t-- [4] - dictionary of bnodes\n make_array (50000, 'any'),\t\t-- [5] - accumulator of triples\n 0,\t\t\t\t\t-- [6] - number of triples in the accumulator\n vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0), -- [7] env of http_ttl_xxx()\n 0,\t\t\t\t\t-- [8] - count of tripless written to the current file\n cut_size );\t\t\t\t-- [9] - size of single cut file\n}\n;\n\ncreate procedure DB.DBA.RDFXML_FILE_SPLIT_FLUSH (inout app_env any, in can_continue_file integer)\n{\n declare tctr, tcount, total_tcount, cut_size integer;\n declare triples, env, ses any;\n dbg_obj_princ ('DB.DBA.RDFXML_FILE_SPLIT_FLUSH (..., can_continue_file=', can_continue_file, '): file=', app_env[2], ' tcount=', app_env[6], ' total_count=', app_env[8]);\n ses := aref_set_0 (app_env, 1);\n triples := aref_set_0 (app_env, 5);\n tcount := app_env[6];\n env := 
aref_set_0 (app_env, 7);\n total_tcount := app_env[8];\n cut_size := app_env[9];\n for (tctr := 0; tctr < tcount; tctr := tctr + 1)\n {\n http_ttl_prefixes (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);\n }\n for (tctr := 0; tctr < tcount; tctr := tctr + 1)\n {\n http_ttl_triple (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);\n }\n app_env[6] := 0;\n total_tcount := total_tcount + tcount;\n if ((not can_continue_file) or (total_tcount >= cut_size))\n {\n string_to_file (sprintf (app_env[0], app_env[2]), ses, -2);\n ses := string_output ();\n env := vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0);\n app_env[2] := app_env[2] + 1;\n total_tcount := 0;\n }\n aset_zap_arg (app_env, 1, ses);\n aset_zap_arg (app_env, 5, triples);\n aset_zap_arg (app_env, 7, env);\n app_env[8] := total_tcount;\n}\n;\n\ncreate procedure DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK (inout g_iid IRI_ID, inout app_env any, inout res IRI_ID) {\n declare i integer;\n i := app_env[3];\n res := iri_id_from_num (i);\n app_env[3] := i+1;\n}\n;\n\ncreate procedure DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID (inout uri varchar, inout g_iid IRI_ID, inout app_env any, inout res IRI_ID) {\n res := uri;\n}\n;\n\ncreate procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE (\n inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar,\n inout o_uri varchar,\n inout app_env any )\n{\n if (app_env[6] >= 50000)\n DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);\n __box_flags_set (o_uri, 1);\n app_env[5][app_env[6]] := vector (s_uri, p_uri, o_uri);\n app_env[6] := app_env[6]+1;\n}\n;\n\ncreate procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L (\n inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar,\n inout o_val any, inout o_type varchar, inout o_lang varchar,\n inout app_env any )\n{\n if (app_env[6] >= 50000)\n DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);\n app_env[5][app_env[6]] := vector (s_uri, p_uri, DB.DBA.RDF_MAKE_LONG_OF_TYPEDSQLVAL_STRINGS (o_val, o_type, o_lang));\n app_env[6] 
:= app_env[6]+1;\n}\n;\n\ncreate procedure DB.DBA.RDFXML_FILE_SPLIT (in in_fname varchar, in base varchar, in parse_mode integer, in out_fname_tmpl varchar, in cut_size integer := 100000000)\n{\n declare in_ses, app_env any;\n if (in_fname like '%.rdf.gz' or in_fname like '%.xml.gz')\n in_ses := gz_file_open (in_fname);\n else\n in_ses := file_open (in_fname);\n DB.DBA.RDFXML_FILE_SPLIT_INIT (out_fname_tmpl, cut_size, app_env);\n rdf_load_rdfxml (in_ses, parse_mode,\n '' /* fake graph, UNAME is to avoid copying */,\n vector (\n '',\n 'DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK',\n 'DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID',\n 'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE',\n 'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L',\n '',\n '' ),\n app_env,\n base );\n RDFXML_FILE_SPLIT_FLUSH (app_env, 0);\n}\n;\n\nDB.DBA.RDFXML_FILE_SPLIT ('/demos/uniprot/src/uniparc.rdf', 'http://purl.uniprot.org/uniparc/', 0, '/demos/uniprot/src/uniparc%06d.ttl', 200000);\n\n\n\n---++Related\n\n * [[VirtTipsAndTricksGuide][Virtuoso Tips and Tricks Collection]]\n * [[http://docs.openlinksw.com/virtuoso/rdfsparql.html][Virtuoso Documentation]]\n" . . . "2017-06-13T05:39:41Z" . . . . . . . "VirtTipsAndTricksGuideSplittingUnProt" . . "7505540fd1a7bd2b04553f8b2df53466" . . . "VirtTipsAndTricksGuideSplittingUnProt" . "2017-06-13T05:39:41.258265"^^ . "2017-06-13T05:39:41.258265"^^ . . "2017-06-13T05:39:41Z" . . . "VirtTipsAndTricksGuideSplittingUnProt" . . .