Introduction
I have a vector `entities` containing 44 million names. I want to split it into 4 parts and process each part in parallel. The class `Freebase` contains the function `loadData()`, which splits the vector and calls the function `multiThread` to do the processing.
- `loadEntities()` reads a text file containing the names. I didn't include its implementation in the class because it's not important.
- `loadData()` splits the vector `entities`, which was initialized in the constructor, into 4 parts and adds a thread for each part to the `vector<thread> threads` as follows:
threads.push_back(thread(&Freebase::multiThread, this, i, i + right, ref(data)));
- multiThread is the function where I process the files
- `i` and `i + right` are the indices used in the for loop of `multiThread` to loop through `entities`.
- `returnValues` is a subfunction of `multiThread` and is used to call an external function.
Problem
cout <<"Entity " << entities[i] << endl; is showing the following results:
- Entity m.0rzf6wv (ok)
- Entity m.0rzf70 (ok)
- Entity m.068s4h9 m.0n_k8bz (WRONG)
- Entity Entity m.068s5_1 (WRONG)
The last 2 outputs are wrong. The output should be:
"Entity name", not "Entity Entity name" nor "Entity name name".
This is causing a segmentation fault when the input is being sent to function returnValues. How can I solve it?
Source Code
#ifndef FREEBASE_H
#define FREEBASE_H
/**
 * Holds a list of entity names and processes them in parallel.
 *
 * The constructor stores four configuration strings and fills `entities`;
 * loadData() partitions `entities` into index ranges and runs multiThread()
 * on each range in its own thread.
 *
 * NOTE(review): the constructor also calls loadEntities(), whose declaration
 * is not visible here. If it is a member function it must be declared in this
 * class as well — confirm against the full source.
 */
class Freebase
{
public:
    /// Arguments, in order: server URL, entities string, XML file, tf-idf database.
    Freebase(const std::string &, const std::string &, const std::string &, const std::string &);

    /// Splits `entities` into ranges and processes every range on its own thread.
    void loadData();

private:
    // Declared in the same order as the constructor's initializer list.
    std::string _serverURL;      // passed to SparqlQuery by returnValues()
    std::string _entities;
    std::string _xmlFile;
    std::string _tfidfDatabase;  // initialized by the constructor; was missing from the class

    /// Processes entities[start, end) and appends one (entity, description) pair per index.
    void multiThread(int,int, std::vector<std::pair<std::string, std::string>> &);

    /// Runs one query against _serverURL and returns the parsed result rows.
    /// Defined as Freebase::returnValues in the implementation, so it must be declared here.
    std::vector<std::pair<std::string, std::string>> returnValues(const std::string &);

    //private data members
    std::vector<std::string> entities;
};
#endif
#include "Freebase.h"

#include <mutex>

#include "queries/SparqlQuery.h"
/**
 * Stores the four configuration strings and then fills `entities` with the
 * value returned by loadEntities().
 *
 * @param url            copied into _serverURL
 * @param e              copied into _entities
 * @param xmlFile        copied into _xmlFile
 * @param tfidfDatabase  copied into _tfidfDatabase
 */
Freebase::Freebase(const string & url, const string & e, const string & xmlFile, const string & tfidfDatabase)
    : _serverURL(url)
    , _entities(e)
    , _xmlFile(xmlFile)
    , _tfidfDatabase(tfidfDatabase)
{
    // The constructor body runs after all members listed above are initialized.
    this->entities = loadEntities();
}
void Freebase::multiThread(int start, int end, vector<pair<string,string>> & data)
{
string basekb = "PREFIX basekb:<http://rdf.basekb.com/ns/> ";
for(int i = start; i < end; i++)
{
cout <<"Entity " << entities[i] << endl;
vector<pair<string, string>> description = returnValues(basekb + "select ?description where {"+ entities[i] +" basekb:common.topic.description ?description. FILTER (lang(?description) = 'en') }");
string desc = "";
for(auto &d: description)
{
desc += d.first + " ";
}
data.push_back(make_pair(entities[i], desc));
}
}
void Freebase::loadData()
{
vector<pair<string, string>> data;
vector<thread> threads;
int Size = entities.size();
//split database into 4 parts
int p = 4;
int right = round((double)Size / (double)p);
int left = Size % p;
float totalduration = 0;
vector<pair<int, int>> coordinates;
int counter = 0;
for(int i = 0; i < Size; i += right)
{
if(i < Size - right)
{
threads.push_back(thread(&Freebase::multiThread, this, i, i + right, ref(data)));
}
else
{
threads.push_back(thread(&Freebase::multiThread, this, i, Size, ref(data)));
}
}//end outer for
for(auto &t : threads)
{
t.join();
}
}
/**
 * Sends `query` to the server at _serverURL through SparqlQuery and converts
 * the textual response into a list of string pairs.
 *
 * The first line of the response is discarded (presumably a header row —
 * confirm against SparqlQuery's output format). For each remaining line all
 * double quotes are removed and the line is split on tab characters:
 *   - exactly two fields -> (first field, second field)
 *   - anything else      -> (whole line, "")
 *
 * @param query the query text to execute
 * @return one pair per response line after the first
 */
vector<pair<string, string>> Freebase::returnValues(const string & query)
{
    SparqlQuery request(query, _serverURL);
    const string payload = request.retrieveInformations();
    istringstream response(payload);

    vector<pair<string, string>> rows;
    string row;

    // Consume and ignore the first line.
    getline(response, row);

    while(getline(response, row))
    {
        // Strip every double quote before splitting.
        row.erase(remove(row.begin(), row.end(), '\"'), row.end());

        vector<string> columns;
        boost::split(columns, row, boost::is_any_of("\t"));

        if(columns.size() != 2)
        {
            // Not a two-column row: keep the whole line as the first element.
            rows.push_back(make_pair(row, ""));
            continue;
        }
        rows.push_back(make_pair(columns[0], columns[1]));
    }
    return rows;
}