webcrawler link extraction problem - c++

im writing a simple webcrawler. the problem is, with the link extraction.
i uses the cpp-netlib with boost. here a few lines of my CLink Class.
CLink::CLink(const CLink& father, const std::string& relUrl )
{
uri = relUrl;
boost::network::uri::uri instance(relUrl);
boost::network::uri::uri instanceFather(father.uri);
if ( (valid = boost::network::uri::is_valid(instance)) == 1)
{
scheme = boost::network::uri::scheme(instance);
user_info = boost::network::uri::user_info(instance);
host = boost::network::uri::host(instance);
port = boost::network::uri::port(instance);
path = boost::network::uri::path(instance);
query = boost::network::uri::query(instance);
fragment = boost::network::uri::fragment(instance);
uri = scheme;
uri += "://";
uri += host;
uri += path;
}
else
{
if ( (valid = boost::network::uri::is_valid(instanceFather)) == 1)
{
scheme = boost::network::uri::scheme(instanceFather);
user_info = boost::network::uri::user_info(instanceFather);
host = boost::network::uri::host(instanceFather);
port = boost::network::uri::port(instanceFather);
path = boost::network::uri::path(instance);
query = boost::network::uri::query(instance);
fragment = boost::network::uri::fragment(instance);
uri = scheme;
uri += "://";
uri += host;
uri += path;
}
}
};
CLink::CLink( const std::string& _url )
{
uri = _url;
boost::network::uri::uri instance(_url);
if ( (valid = boost::network::uri::is_valid(instance) ) == 1)
{
scheme = boost::network::uri::scheme(instance);
user_info = boost::network::uri::user_info(instance);
host = boost::network::uri::host(instance);
port = boost::network::uri::port(instance);
path = boost::network::uri::path(instance);
query = boost::network::uri::query(instance);
fragment = boost::network::uri::fragment(instance);
uri = scheme;
uri += "://";
uri += host;
uri += path;
}
else
std::cout << "err " << std::endl;
};
the links from the webpage i took with the htmlcxx lib. i took the HTML::Node and normalize them wih the boost filesystem.
if ( url.find("http://") == std::string::npos)
{
std::string path = link.get_path() + url;
url = link.get_host() + path;
boost::filesystem::path result;
boost::filesystem::path p(url);
for(boost::filesystem::path::iterator it=p.begin(); it!=p.end(); ++it)
{
if(*it == "..")
{
if(boost::filesystem::is_symlink(result) )
result /= *it;
else if(result.filename() == "..")
result /= *it;
else
result = result.parent_path();
}
else if(*it == ".")
{
// Ignore
}
else
{
// Just cat other path entries
result /= *it;
}
}
url = "http://" + result.string();
}
return ret;
Now the problem is.
i try to fetch http://www.wikipedia.de/ and i get the urls like
properties
http://wikimedia.de/wiki/Vereinszeitung
... ...
and on the site http://wikimedia.de/wiki/Vereinszeitung there is a link like /wiki/vereinsatzung
so often i get links like
http://wikimedia.de/wiki/Vereinszeitung/wiki/Freies_Wissen
does someone have a idee?

You need to have a special case for absolute links (those that start with /).
If the href starts with /, then the resulting link should be (using the terms from The URI template which come from the RFC):
[scheme]://[authority][what you got in href]
What you are currently constructing is:
[scheme]://[authority][path][what you got in href]
So you're duplicating the path information.
So if link.get_path() starts with /, you should simply change:
std::string path = link.get_path() + url;
url = link.get_host() + path; // this is incorrect btw, missing the [port]
to
url = link.get_host() + ":" + link.get_port() + url;
It would probably be cleaner to do the path normalization on the path only, not on the URL (i.e. add host:port after normalizing the path).
[And I think your code will fail if it encounters an https link.]

Related

Sitecore.Analytics.Tracker.Current is null when invoked through a pipeline

I need to redirect based on the country location the user is trying to access. For example when user trying to access http://www.example.com/ from china my site should as http://www.example.com/zh. I am checking using the sitecore tracker in pipeline process to get the country code using the below method.
public void Process(HttpRequestArgs args)
{
Assert.ArgumentNotNull(args, "args");
if (HttpContext.Current == null
|| Context.Site == null
////TODO: || Sitecore.Context.PageMode...
|| Context.Database == null || Context.Site.Name == "shell" || !this._sites.Contains(Context.Site.Name))
{
return;
}
// contains path including language and query string
// (not anchor name), but not hostname.
// We can use this to add the language back into the path.
string rawPath = Sitecore.Context.RawUrl;
if (!rawPath.StartsWith("/sitecore") && !rawPath.StartsWith("/" + Sitecore.Context.Language.Name + "/") && !rawPath.StartsWith("/" + Sitecore.Context.Language.Name) && !rawPath.StartsWith("/default.aspx"))
{
string langCode = "";
if(!string.IsNullOrEmpty(GeoIPUtils.GetUserGeoIP()))
{
try
{
string country = GeoIPUtils.GetUserGeoIP();;
if (country.Trim().ToUpper() == "China".ToUpper())
langCode = "zh";
else if (country.Trim().ToUpper() == "Japan".ToUpper())
langCode = "jp";
else if (country.Trim().ToUpper() == "Thailand".ToUpper())
langCode = "th";
else
langCode = "en";
}
catch(Exception)
{
langCode = "en";
}
}
else
{
langCode = HttpContext.Current.Request.Cookies["avc#lang"].Value.ToString();
}
if (!string.IsNullOrEmpty(langCode))
{
Language language = null;
if (Language.TryParse(langCode, out language))
{
//then try to get the language item id from the language or two letter iso code
ID langID = LanguageManager.GetLanguageItemId(language, Sitecore.Context.Database);
if (!ID.IsNullOrEmpty(langID))
{
//sometimes the language found is slightly different than official language item used in SC
language = LanguageManager.GetLanguage(language.CultureInfo.TwoLetterISOLanguageName);
if (Context.Item != null)
{
List<string> availableLangs = LanguagesWithContent(Context.Item);
if (availableLangs != null && availableLangs.Count > 0 && !availableLangs.Contains(language.CultureInfo.TwoLetterISOLanguageName.ToString()))
{
langCode = availableLangs.FirstOrDefault().ToString();
}
}
else
{
langCode = "en";
}
}
else
{
langCode = "en";
}
}
}
HttpContext.Current.Response.RedirectPermanent("/" + (String.IsNullOrEmpty(langCode) ? Sitecore.Context.Language.Name : langCode) + rawPath);
}
}
Below is the GetUserGeoIP function
public static string GetUserGeoIP()
{
string countryCode = "";
try
{
countryCode = Sitecore.Analytics.Tracker.Current.Interaction.GeoData.Country;
}
catch(Exception ex)
{
Log.Error("GetUserGeoIP Error: " + ex.Message + " Source: " + ex.Source + " Stack Trace :" + ex.StackTrace + " Inner Ex : " + ex.InnerException, ex);
countryCode = "GB";
}
if (!string.IsNullOrEmpty(countryCode))
{
var countryItem = ISO3166.FromAlpha2(countryCode);
if (countryItem != null)
return countryItem.Name;
}
return "Other";
}
But I am getting an below exception
7904 10:43:25 ERROR Cannot create tracker.
Exception: System.InvalidOperationException
Message: session is not initialized
Source: Sitecore.Analytics
at Sitecore.Analytics.Data.HttpSessionContextManager.GetSession()
at Sitecore.Analytics.Pipelines.EnsureSessionContext.EnsureContext.Process(InitializeTrackerArgs args)
at (Object , Object[] )
at Sitecore.Pipelines.CorePipeline.Run(PipelineArgs args)
at Sitecore.Analytics.DefaultTracker.EnsureSessionContext()
at Sitecore.Analytics.Pipelines.CreateTracker.GetTracker.Process(CreateTrackerArgs args)
at (Object , Object[] )
at Sitecore.Pipelines.CorePipeline.Run(PipelineArgs args)
at Sitecore.Analytics.Tracker.Initialize()
Note: The same GetUserGeoIP method is used in API which gets the correct countryName. I am using Sitecore.NET 8.0 (rev. 151127) version
Any help on this highly appreciated
Your processor is probably too soon in the pipeline(s). You can find an overview of the request pipelines here: http://sitecoreskills.blogspot.be/2015/02/a-sitecore-8-request-from-beginning-to.html
You should put your processor after the tracker has been initialized and the geo data has been fetched. I did something similar a while ago and placed my processor in the startAnalytics pipeline.
Fetching the geo data is async so that might (will) give issues when doing this in the first request pipeline. Read this article to tackle that issue by calling UpdateGeoIpData with a timespan.

MongoDB C driver: how to use regex query collection?

for example, In MySQL the query like this:
select name from test_tab where id = 1 and name like 'test%';
I want to translate this query in mongodb by mongodb C driver, so I write the code like this, but all don't work(the searchDoc() function is okay.
string strName = "";
bson_t *cond = bson_new();
BSON_APPEND_INT32(cond, "id", 1);
if (strlen(name.c_str()) > 0)
{
strName = "/^" + name + "/";
bson_t *child;
child = bson_new();
bson_append_document_begin(cond, "name", -1, child);
BSON_APPEND_UTF8(child, "$regex", strName.c_str());
bson_append_document_end(cond, child);
}
mongoc_cursor_t *cursor(NULL);
int iRet = searchDoc("db", "test_tab", cond, cursor);
if (iRet < 0)
{
bson_destroy(cond);
mongoc_cursor_destroy(cursor);
return -1;
}
and
string strName = "";
bson_t *cond = bson_new();
BSON_APPEND_INT32(cond, "id", 1);
if (strlen(name.c_str()) > 0)
{
strName = "/^" + name + ".*/";
BSON_APPEND_REGEX(cond, "name", strName.c_str(), "i");
}
mongoc_cursor_t *cursor(NULL);
int iRet = searchDoc("db", "test_tab", cond, cursor);
if (iRet < 0)
{
bson_destroy(cond);
mongoc_cursor_destroy(cursor);
return -1;
}
How can I build the regex pattern to query record work okay?
Thank you
I seach this page: https://jira.mongodb.org/browse/CDRIVER-206,
there is a important tips: "You do not need to include the forward-slash when building a regex."
so I delete the forward-slash, my code
strName = "/^" + name + ".*/";
BSON_APPEND_REGEX(cond, "name", strName.c_str(), "i");
change to
strName = "/^" + name + ".*/";
BSON_APPEND_REGEX(cond, "name", strName.c_str(), "i");
It can work. My problem has been solved.

Detect USB devices event

I made a console application which detects plugin and plugout events for all type of usb devices. but I wanted some filteration in it like I wanted to detect only webcams . This was done by using GUID class. The class for webcam is 'Image' class with GUID "{6bdd1fc5-810f-11d0-bec7-08002be2092f}" .The problem is that this 'Image' class is also used for scanners and I dont want to detect scanners.The code is given below:
static void Main(string[] args)
{
WqlEventQuery weqQuery = new WqlEventQuery();
weqQuery.EventClassName = "__InstanceOperationEvent";
weqQuery.WithinInterval = new TimeSpan(0, 0, 3);
weqQuery.Condition = #"TargetInstance ISA 'Win32_PnPEntity'";
ManagementEventWatcher m_mewWatcher = new ManagementEventWatcher(weqQuery);
m_mewWatcher.EventArrived += new EventArrivedEventHandler(m_mewWatcher_EventArrived);
m_mewWatcher.Start();
Console.ReadLine();
}
static void m_mewWatcher_EventArrived(object sender, EventArrivedEventArgs e)
{
bool bUSBEvent = false;
string deviceCaption = "";
string deviceType = "";
foreach (PropertyData pdData in e.NewEvent.Properties)
{
try
{
ManagementBaseObject mbo = (ManagementBaseObject)pdData.Value;
if (mbo != null)
{
foreach (PropertyData pdDataSub in mbo.Properties)
{
Console.WriteLine(pdDataSub.Name + " = " + pdDataSub.Value);
if (pdDataSub.Name == "Caption")
{
deviceCaption = pdDataSub.Value.ToString();
}
if (pdDataSub.Name == "ClassGuid" && pdDataSub.Value.ToString() == "{6bdd1fc5-810f-11d0-bec7-08002be2092f}")
{
bUSBEvent = true;
deviceType = "Image";
}
}
if (bUSBEvent)
{
if (e.NewEvent.ClassPath.ClassName == "__InstanceCreationEvent")
{
Console.WriteLine("A " + deviceType + " device " + deviceCaption + " was plugged in at " + DateTime.Now.ToString());
}
else if (e.NewEvent.ClassPath.ClassName == "__InstanceDeletionEvent")
{
Console.WriteLine("A " + deviceType + " device " + deviceCaption + " was plugged out at " + DateTime.Now.ToString());
}
}
}
}
catch (Exception ex)
{
}
}
}
for references check this link
I waited but no body answered this question so, after seeing all properties of ManagementBaseObject I found that there is a property named Service which is different for scanners. In scanners the value of Service property is usbscan while in cameras it is usbvideo.
eg.
you can do something like this
if (mbo.Properties["Service"].Value.ToString() == "usbscan")
{
//then it means it is a scanner
}
else
{
//then it means it is a camera
}
note: The main question was that how can we differentiate between a scanner and a webcam because they both use same GUID.

QString parse system variables

How I should parse QString, which contains system variables ?What I want:
QString path = "%WINDIR%\\System32\\";
QString output = parse(path);
QDebug()<<output; \\ output is "C:\\Windows\\System32\\"
I think you want something like this:
// Untested
QString parse(QString str)
{
int pos = 0;
QRegExp rx("%([^%]+)%"); // Match env var between two '%'
rx.setMinimal(true);
while((pos = rx.indexIn(str, pos)) != -1)
{
// Replace env var
QString capture = rx.cap(1);
QString replacement = getenv(capture.toAscii());
str.replace("%" + capture + "%", replacement);
// Skip env var + two '%'
pos += rx.matchedLength() + 2;
}
return str;
}
QString path = parse("%WINDIR%\\System32");
I think, this is what you looking for. Please try this
QString windir = getenv ("WINDIR"); // Expanded
if (windir.isEmpty()) {
fprintf(stderr, "Generator requires WINDIRto be set\n");
}
windir += "\\System32";
qDebug()<<windir;

List of windows users remotely

I would like to know how to retrieve the list of users that are logged onto a Remote machine. I can do it with qwinsta /server:xxxx, but would like to do it in C#.
check out wmi in .net under system.management.
something like:
ConnectionOptions conn = new ConnectionOptions();
conn.Authority = "ntdlmdomain:NAMEOFDOMAIN";
conn.Username = "";
conn.Password = "";
ManagementScope ms = new ManagementScope(#"\\remotecomputer\root\cimv2", conn);
ms.Connect();
ObjectQuery qry = new ObjectQuery("select * from Win32_ComputerSystem");
ManagementObjectSearcher search = new ManagementObjectSearcher(ms, qry);
ManagementObjectCollection return = search.Get();
foreach (ManagementObject rec in return)
{
Console.WriteLine("Logged in user: " + rec["UserName"].ToString());
}
You may not need the ConnectionOptions...
I ended up using the qwinsta /server:server1 command from C# -- it was much easier
thanks Ken
All this checks all 8 servers at the same time -- I put the result in sql server
but you can do what ever you want
query_citrix.bat script
cd C:.......\bin\citrix_boxes
qwinsta -server:servername or ip > servername.txt
string sAppCitrixPath = Application.StartupPath.ToString() + "\\citrix_boxes\\";
//Run Script for current citrix boxes
Process proc = new Process();
ProcessStartInfo si = new ProcessStartInfo();
si.FileName = sAppCitrixPath + "query_citrix.bat";
proc.StartInfo = si;
proc.Start();
proc.WaitForExit();
int exitCode = proc.ExitCode;
proc.Close();
if (exitCode == 0)
{
//Execute update who is on the Citrix_Boxes Currently
DirectoryInfo dic = new DirectoryInfo(sAppCitrixPath);
FileInfo[] fic = dic.GetFiles("*.txt");
for (int i = 0; i < fic.Length; i++)
{
ParseQWinStaServerFile(fic[i].FullName.ToString(), fic[i].Name.ToString().ToUpper().Replace(".TXT",""));
File.Delete(fic[i].FullName.ToString());
}
}
private void ParseQWinStaServerFile(string sLocation,string sServer)
{
using (StreamReader sr = File.OpenText(sLocation))
{
string sRecord = String.Empty;
char[] cSep = new char[] {' '};
bool bFirst = true;
while ((sRecord = sr.ReadLine()) != null)
{
if (bFirst == false)
{
string[] items = sRecord.Split(cSep, StringSplitOptions.RemoveEmptyEntries);
//Make sure all columns are present on the split for valid records
if (sRecord.Substring(19, 1) != " ") // check position of user id to see if it's their
{
//Send the user id and server name where you want to.
//here is your user
id = items[1].ToString().ToLower().Trim()
//here is your server
};
}
else
{
bFirst = false;
}
}
}
}