R h2o gsub null pointer - regex

I'm using the h2o package in R and trying to do some data manipulation but having some issues with the sub/gsub functions.
Here's my code:
library(h2o)
# Start cluster
localH2O = h2o.init(nthreads = 2)
# Create data set
dat1.mini <- structure(list(id = c("7927751403363142656", "18236986451472797696",
"5654946373641778176", "14195690822403907584", "1693303484298446848",
"1.1362181921561e+19", "11694645532962195456", "1221431312630614784",
"1987127670789791488", "379819848497418688"), click = c("0",
"0", "0", "0", "0", "0", "0", "1", "0", "0"), hour = c("14102118",
"14102217", "14102812", "14102912", "14102820", "14102401", "14102117",
"14102312", "14102301", "14102414"), C1 = c("1005", "1005", "1005",
"1002", "1005", "1005", "1005", "1005", "1005", "1005"), banner_pos = c("1",
"1", "0", "0", "0", "0", "1", "1", "0", "0"), site_id = c("b7e9786d",
"e151e245", "85f751fd", "ee4c822c", "85f751fd", "85f751fd", "e5c60a05",
"e151e245", "1fbe01fe", "1fbe01fe"), site_domain = c("b12b9f85",
"7e091613", "c4e18dd6", "c4e18dd6", "c4e18dd6", "c4e18dd6", "7256c623",
"7e091613", "f3845767", "f3845767"), site_category = c("f028772b",
"f028772b", "50e219e0", "50e219e0", "50e219e0", "50e219e0", "f028772b",
"f028772b", "28905ebd", "28905ebd"), app_id = c("ecad2386", "ecad2386",
"685d1c4c", "ecad2386", "92f5800b", "f02cb7ab", "ecad2386", "ecad2386",
"ecad2386", "ecad2386"), app_domain = c("7801e8d9", "7801e8d9",
"2347f47a", "7801e8d9", "ae637522", "2347f47a", "7801e8d9", "7801e8d9",
"7801e8d9", "7801e8d9"), app_category = c("07d7df22", "07d7df22",
"8ded1f7a", "07d7df22", "0f2161f8", "f95efa07", "07d7df22", "07d7df22",
"07d7df22", "07d7df22"), device_id = c("a99f214a", "a99f214a",
"a99f214a", "8374cacf", "a99f214a", "8a5908a5", "a99f214a", "a99f214a",
"a99f214a", "a99f214a"), device_ip = c("3214d61e", "d5623936",
"419e166e", "698846d6", "c2d9c2f2", "40817190", "edd10fc1", "e4c6e857",
"05d3adbe", "6929d972"), device_model = c("a0f5f879", "69f9dd0e",
"46a414f4", "12edfe21", "4ffd3a7e", "04f5b394", "779d90c2", "1f0bc64f",
"293291c1", "d787e91b"), device_type = c("1", "1", "1", "0",
"1", "1", "1", "1", "1", "1"), device_conn_type = c("0", "0",
"3", "0", "3", "0", "0", "0", "0", "0"), C14 = c("16208", "20277",
"23224", "17566", "21189", "20633", "19771", "17264", "15703",
"20108"), C15 = c("320", "320", "320", "320", "320", "320", "320",
"320", "320", "320"), C16 = c("50", "50", "50", "50", "50", "50",
"50", "50", "50", "50"), C17 = c("1800", "2281", "2676", "479",
"2424", "2374", "2227", "1872", "1722", "2299"), C18 = c("3",
"3", "0", "3", "1", "3", "0", "3", "0", "2"), C19 = c("167",
"47", "35", "39", "161", "39", "679", "39", "35", "1327"), C20 = c("100077",
"100181", "100176", "100074", "100189", "-1", "100074", "-1",
"-1", "-1"), C21 = c("23", "42", "221", "23", "71", "23", "48",
"23", "79", "52")), .Names = c("id", "click", "hour", "C1", "banner_pos",
"site_id", "site_domain", "site_category", "app_id", "app_domain",
"app_category", "device_id", "device_ip", "device_model", "device_type",
"device_conn_type", "C14", "C15", "C16", "C17", "C18", "C19",
"C20", "C21"), row.names = c(NA, 10L), class = "data.frame")
# Load data to cluster
dat.mini.hex <- as.h2o(localH2O, dat1.mini)
# Attempt to grab substring of first 6 characters from hour column
dat.mini.hex$hr <- h2o.sub('^(.{6}).*$','\\1', dat.mini.hex$hour)
dat.mini.hex$hr <- h2o.gsub('(.+)..','\\1', dat.mini.hex$hour)
All of these attempts result in the following error:
Error in .h2o.__remoteSend(client, .h2o.__PAGE_EXEC2, str = expr) :
http://127.0.0.1:54321/2/Exec2.json returned the following error:
class java.lang.NullPointerException

The error occurs because hour is a numeric column. The functions h2o.sub and h2o.gsub do not work with numeric data.
The command str(dat.mini.hex$hour) will show you that hour is a numeric column.
str(dat.mini.hex$hour)
You can convert hour to a factor and save the result in a new column hour2.
dat.mini.hex$hour2 <- as.factor(dat.mini.hex$hour)
Now, you can use h2o.sub. However, I suppose you will not like the result...
h2o.sub('^(.{6}).*$','\\1', dat.mini.hex$hour2)
# hour2
# 1 \\1
# 2 \\1
# 3 \\1
# 4 \\1
# 5 \\1
# 6 \\1
As you can see, h2o.sub inserts \\1 literally instead of treating it as a backreference to the first capture group. This behaviour differs from base R's sub.
You can change your regex and replace the characters after the first six ones with the empty string.
h2o.sub('(?<=^.{6}).*$','', dat.mini.hex$hour2)
# hour2
# 1 141021
# 2 141022
# 3 141028
# 4 141029
# 5 141028
# 6 141024
Here, (?<=^.{6}) is a positive lookbehind. It matches the position immediately after the first six characters of the string (here, six digits), so .*$ matches only the remainder, which is then replaced with the empty string.

Related

Scala Regex: matching a non single character input

For instance the function:
val reg = "[^ ]".r
matches any single character except the space character.
and:
def matchReg(line: String): List[String] = reg.findAllIn(line).toList
Converts the input into a list.
However, how would I change this so that it also matches tokens longer than one character? At the moment an input such as "14" is split into "1" and "4" when the matches are collected into a list. If the input is "14", I want the output to be "14" rather than two separate characters. Thank you.
EDIT: I have written test cases to explain the type of output I am looking for
"The match" should "take list" in {
assert(matchReg("(d e f)") == List("(", "d", "e", "f", ")"))
}
it should "take numbers" in {
assert(matchReg("(12 45 -9 347 4)") == List("(", "12", "45", "-9", "347", "4", ")"))
}
it should "take operators" in {
assert(matchReg("(- 7 (* 8 9))") == List("(", "-", "7", "(", "*", "8", "9", ")", ")"))
}
With the following case, "take list" and "take operators" passes successfully if I use:
val reg = "[^ ]".r
However "take numbers" does not pass since numbers such as "347" are being split into "3" "4" and "7", when I want them to register as one single number.
This should work for you
val reg = """[^ \(\)]+|\(|\)""".r
You should add further alternatives if you also want to support [ ], { } or other delimiters.
# matchReg("(d e f)")
res8: List[String] = List("(", "d", "e", "f", ")")
# matchReg("(12 45 -9 347 4)")
res9: List[String] = List("(", "12", "45", "-9", "347", "4", ")")
# matchReg("(- 7 (* 8 9))")
res10: List[String] = List("(", "-", "7", "(", "*", "8", "9", ")", ")")

autofilling a dict python 2.x

I'm quite new to Python and programming in general, so apologies if this is quite basic or has been asked and answered before. Here is a sample of the data I'm working with:
{
"homeTeam": {
"formation": [
"4",
"4",
"2"
],
"lineupsSorted": [
{
"player": {
"name": "Scott P. Brown",
"slug": "scott-p-brown",
"shortName": "S. P. Brown",
"id": 19889,
"hasImage": true
},
"position": 1,
"shirtNumber": 1,
"substitute": false,
"positionName": "Goalkeeper",
"positionNameshort": "G",
"captain": false
},
{
"player": {
"name": "Carl Winchester",
"slug": "carl-winchester",
"shortName": "C. Winchester",
"id": 110785,
"hasImage": true
},
"position": 2,
"shirtNumber": 27,
"substitute": false,
"positionName": "Midfielder",
"positionNameshort": "M",
"captain": false
},
I am looking to automate populating defined names as I have done manually here:
hometeamPositions =['Goalkeeper','Midfielder','Defender','Defender','Defender','Midfielder','Midfielder','Midfielder','Midfielder','Forward','Forward','Goalkeeper','Defender','Defender','Midfielder','Midfielder','Forward','Forward']
# Short player names, in the same order as hometeamPositions.
# Each name is kept whole on one line: a quoted string cannot span a line break.
hometeamPlayers = [
    'S. P. Brown', 'C. Winchester', 'M. Onariase', 'W. Boyle',
    'J. Cranston', 'H. Pell', 'J. Rowe', 'K. Storer', 'B. Waters',
    'D. Wright', 'D. Holman', 'R. Lovett', 'J. Barthram', 'T. Plavotic',
    'J. Munns', 'L. Davis', 'K. Wootton', 'J. Dayton',
]
As I will be repeating this process many hundreds of times with different data (same structure) I was wondering if anyone could give me some tips on automatically building these ranges?
Thanks,
Peter
I'm not sure I understood what is the problem you are trying to solve but I'll try to help.
Assuming you have a dictionary teams_dict and you want to create two lists, hometeamPositions and hometeamPlayers, you can use the following code:
# Collect the short name and the position name of every lineup entry,
# keeping both lists in lineup order so index i refers to the same player.
hometeamPlayers = []
hometeamPositions = []
for player_dict in teams_dict['homeTeam']['lineupsSorted']:
    # The name lives in the nested 'player' dict; the position is on the entry itself.
    hometeamPlayers.append(player_dict['player']['shortName'])
    hometeamPositions.append(player_dict['positionName'])
The output on your example will be:
hometeamPlayers = ['S. P. Brown', 'C. Winchester']
hometeamPositions = ['Goalkeeper', 'Midfielder']

Regex to find State and Zip from Address

Trying to make regex that can get state from address
1- 1234 Bellaire Blvd, Suite 123, Houston, TX 77036
2- 1234 BELLAIRE BL #123, HOUSTON, TX 77036
I have this for state
\w{2}(?=\s\d{1,5})
And this for Zip
(?<=\w{2}\s)\d{5}
FOR STATE
In 1st case above regex is returning "te" from "Suite" and TX for state which is correct
However, in 2nd case it is returning nothing
FOR ZIP
77036 is returned in 1st case and null is returned in 2nd case
I don't think regular expressions are the best way to do this. Rather, I'd use an API that parses the address into its components; you then only need to read state_abbreviation (and zipcode) from the response. Example response:
[
{
"input_index": 0,
"candidate_index": 0,
"delivery_line_1": "1 Santa Claus Ln",
"last_line": "North Pole AK 99705-9901",
"delivery_point_barcode": "997059901010",
"components": {
"primary_number": "1",
"street_name": "Santa Claus",
"street_suffix": "Ln",
"city_name": "North Pole",
"state_abbreviation": "AK",
"zipcode": "99705",
"plus4_code": "9901",
"delivery_point": "01",
"delivery_point_check_digit": "0"
},
"metadata": {
"record_type": "S",
"zip_type": "Standard",
"county_fips": "02090",
"county_name": "Fairbanks North Star",
"carrier_route": "C004",
"congressional_district": "AL",
"rdi": "Commercial",
"elot_sequence": "0001",
"elot_sort": "A",
"latitude": 64.75233,
"longitude": -147.35297,
"precision": "Zip8",
"time_zone": "Alaska",
"utc_offset": -9,
"dst": true
},
"analysis": {
"dpv_match_code": "Y",
"dpv_footnotes": "AABB",
"dpv_cmra": "N",
"dpv_vacant": "N",
"active": "Y",
"footnotes": "L#"
}
},
{
"input_index": 1,
"candidate_index": 0,
"addressee": "Apple Inc",
"delivery_line_1": "1 Infinite Loop",
// truncated for brevity
}
]
Hope that helped.
You can match against ', ([A-Z]{2}) '; the state will then be the subpattern captured by the parentheses. In Python it would look like this:
import re

# The two sample addresses from the question; only the first is searched below.
s1 = "1- 1234 Bellaire Blvd, Suite 123, Houston, TX 77036"
s2 = "2- 1234 BELLAIRE BL #123, HOUSTON, TX 77036"

# Comma and space, exactly two capital letters (captured), then a space.
state_pattern = re.compile(', ([A-Z]{2}) ')
m = state_pattern.search(s1)

# Group 1 holds the two-letter state code.
print(m.group(1))

Python list and for loop

I'm expecting this code to print spade:A spade:2 and so on until heart:K.
But it only does heart:A to heart:K.
How should I do it?
symbols = ["spade", "clover", "diamond", "heart"]
numbers = ["A", "2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K"]
cards = {}
for num in numbers:
for symbol in symbols:
cards[num] = symbol
print cards
Use your itertools toolbox
import itertools

symbols = ["spade", "clover", "diamond", "heart"]
numbers = ["A", "2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K"]

# Pair every symbol with every number; all numbers for one symbol come
# before the next symbol starts.
combinations = itertools.product(symbols, numbers)

# Each pair is two strings, so joining them with ":" gives the "symbol:number" label.
cards = [":".join(pair) for pair in combinations]
This will give you the list:
['spade:A',
'spade:2',
'spade:3',
'spade:4',
'spade:5',
'spade:6',
'spade:7',
'spade:8',
'spade:9',
'spade:10',
'spade:J',
'spade:Q',
'spade:K',
'clover:A',
'clover:2',
'clover:3',
'clover:4',
'clover:5',
'clover:6',
'clover:7',
'clover:8',
'clover:9',
'clover:10',
'clover:J',
'clover:Q',
'clover:K',
'diamond:A',
'diamond:2',
'diamond:3',
'diamond:4',
'diamond:5',
'diamond:6',
'diamond:7',
'diamond:8',
'diamond:9',
'diamond:10',
'diamond:J',
'diamond:Q',
'diamond:K',
'heart:A',
'heart:2',
'heart:3',
'heart:4',
'heart:5',
'heart:6',
'heart:7',
'heart:8',
'heart:9',
'heart:10',
'heart:J',
'heart:Q',
'heart:K']
The problem is that you are assigning into a dict keyed by number, so each assignment overwrites the previous one instead of appending a new entry to a list. The right way to do it is:
# Build every "symbol:number" label, one symbol at a time, in list order.
symbols = ["spade", "clover", "diamond", "heart"]
numbers = ["A", "2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K"]
cards = []
for symbol in symbols:
    for number in numbers:
        # Both values are already strings, so plain concatenation is enough.
        cards.append(symbol + ':' + number)
# print(...) with a single argument gives the same output on Python 2.7 and Python 3.
print(cards)
with output:
['spade:A', 'spade:2', 'spade:3', 'spade:4', 'spade:5', 'spade:6', 'spade:7', 'spade:8',
'spade:9', 'spade:10', 'spade:J', 'spade:Q', 'spade:K', 'clover:A', 'clover:2',
'clover:3', 'clover:4', 'clover:5', 'clover:6', 'clover:7', 'clover:8', 'clover:9',
'clover:10', 'clover:J', 'clover:Q', 'clover:K', 'diamond:A', 'diamond:2', 'diamond:3',
'diamond:4', 'diamond:5', 'diamond:6', 'diamond:7', 'diamond:8', 'diamond:9', 'diamond:10',
'diamond:J', 'diamond:Q', 'diamond:K', 'heart:A', 'heart:2', 'heart:3', 'heart:4',
'heart:5', 'heart:6', 'heart:7', 'heart:8', 'heart:9', 'heart:10', 'heart:J', 'heart:Q', 'heart:K']
Made with Ipython Notebook in python 2.7
Hope it helps.
Your loops themselves are fine, but a dict can hold only one value per key. For each number, the inner loop over the symbols assigns to the same key cards[num] again and again, so every assignment replaces the value set by the previous one and only the value from the last iteration is left. This means cards["A"] is set 4 times in the loop and only the last value, "heart", is retained. The same thing happens for all the other keys.

how to show data in row on Stacked bar chart by using MS Charts Control

I need to show the data from a SQL query as shown below on stacked bar chart by using MS Chart control on ASPX page
Where I have to show the chart like this
</asp:Chart>
You really should show some code if you want help, but here's a simple vb.net solution.
I added a chart to the page and then ran the following code to produce the below chart
Create your datatable - you can drop this and connect with SQL
Dim t As New DataTable
t.Columns.Add("Risk Categories")
t.Columns.Add("High Impact Risks")
t.Columns.Add("Medium Impact Risks")
t.Columns.Add("Low Impact Risks")
t.Columns.Add("No Impact Risks")
t.Rows.Add("Compliance,Law,Legislation", "4", "1", "0", "5")
t.Rows.Add("Construction", "5", "1", "1", "0")
t.Rows.Add("Design", "3", "1", "0", "0")
t.Rows.Add("Financial", "6", "0", "0", "2")
t.Rows.Add("Human Resources", "2", "0", "0", "10")
t.Rows.Add("Information & Communication", "1", "0", "0", "1")
t.Rows.Add("Interface", "1", "0", "0", "0")
t.Rows.Add("Logistic", "0", "1", "0", "6")
t.Rows.Add("Management", "0", "1", "0", "0")
t.Rows.Add("Planning", "3", "0", "0", "1")
Add the points to the new chart
' One stacked-bar series per impact column. Each column name is also used as
' the series name, and "Risk Categories" supplies the X values for every series.
Dim impactColumns() As String = {"High Impact Risks", "Medium Impact Risks", "Low Impact Risks", "No Impact Risks"}
For index As Integer = 0 To impactColumns.Length - 1
    ' Series(0) is used as it stands (it is only renamed); every later series is added first.
    If index > 0 Then
        Chart1.Series.Add(impactColumns(index))
    End If
    With Chart1.Series(index)
        .ChartType = SeriesChartType.StackedBar
        .Name = impactColumns(index)
        .Points.DataBind(t.DefaultView, "Risk Categories", impactColumns(index), Nothing)
    End With
Next
''Show All Categories on RHS axis
Chart1.ChartAreas(0).AxisX.Interval = 1
Chart1.ChartAreas(0).AxisX.LabelAutoFitStyle = LabelAutoFitStyles.DecreaseFont
''Move legend to bottom and center
Chart1.Legends(0).Docking = Docking.Bottom
Chart1.Legends(0).Alignment = StringAlignment.Center