How to create a UDF with two inputs in pyspark - python-2.7

I am new to pyspark and I am trying to create a simple udf that must take two input columns, check if the second column is blank and, if so, split the first one into two values and overwrite both original columns. This is what I have done:
def split(x, y):
    if x == "EXDRA" and y == "":
        return ("EXT", "DCHA")
    if x == "EXIZQ" and y == "":
        return ("EXT", "IZDA")

udf_split = udf(split, ArrayType())

df = df \
    .withColumn("x", udf_split(df['x'], df['y'])[1]) \
    .withColumn("y", udf_split(df['x'], df['y'])[0])
But when I run this code I get the following error:
File "<stdin>", line 1, in <module>
TypeError: __init__() takes at least 2 arguments (1 given)
What am I doing wrong?
Thank you,
Álvaro

I'm not sure exactly what you are trying to do, but this is how I would do it from what I understood:
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col
def split(x, y):
    if x == "EXDRA" and y == "":
        return ("EXT", "DCHA")
    if x == "EXIZQ" and y == "":
        return ("EXT", "IZDA")

schema = StructType([
    StructField("x1", StringType(), False),
    StructField("y1", StringType(), False)
])
udf_split = udf(split, schema)
df = spark.createDataFrame([("EXDRA", ""), ("EXIZQ", ""), ("", "foo")], ("x", "y"))
df.show()
# +-----+---+
# | x| y|
# +-----+---+
# |EXDRA| |
# |EXIZQ| |
# | |foo|
# +-----+---+
df = df \
    .withColumn("split", udf_split(df['x'], df['y'])) \
    .withColumn("x", col("split.x1")) \
    .withColumn("y", col("split.y1"))
df.printSchema()
# root
# |-- x: string (nullable = true)
# |-- y: string (nullable = true)
# |-- split: struct (nullable = true)
# | |-- x1: string (nullable = false)
# | |-- y1: string (nullable = false)
df.show()
# +----+----+----------+
# | x| y| split|
# +----+----+----------+
# | EXT|DCHA|[EXT,DCHA]|
# | EXT|IZDA|[EXT,IZDA]|
# |null|null| null|
# +----+----+----------+

I guess you have to define your UDF as:
udf_split = udf(split, ArrayType(StringType()))
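For completeness, here is a minimal sketch of that array-returning variant, reusing the split function and the x/y columns from the question (a sketch, not run against a live cluster):
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType

def split(x, y):
    if x == "EXDRA" and y == "":
        return ["EXT", "DCHA"]
    if x == "EXIZQ" and y == "":
        return ["EXT", "IZDA"]

# ArrayType requires an element type; a bare ArrayType() is what raises
# "TypeError: __init__() takes at least 2 arguments (1 given)"
udf_split = udf(split, ArrayType(StringType()))

df = df \
    .withColumn("split", udf_split(df['x'], df['y'])) \
    .withColumn("x", col("split")[0]) \
    .withColumn("y", col("split")[1]) \
    .drop("split")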

Related

How to extract values from a column and have it as float in pyspark?

I have a pyspark dataframe that visually looks like the following. I want the column to hold float values only. Please note that currently the values have square brackets around them.
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType

data = [
    ("Smith", "OH", "[55.5]"),
    ("Anna", "NY", "[33.3]"),
    ("Williams", "OH", "[939.3]"),
]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('state', StringType(), True),
    StructField('salary', StringType(), True)
])
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
Input:
+--------+-----+-------+
|name |state|salary |
+--------+-----+-------+
|Smith |OH |[55.5] |
|Anna |NY |[33.3] |
|Williams|OH |[939.3]|
+--------+-----+-------+
And the output should look like,
+--------+-----+------------------+
|name |state|float_value_salary|
+--------+-----+------------------+
|Smith |OH |55.5 |
|Anna |NY |33.3 |
|Williams|OH |939.3 |
+--------+-----+------------------+
Thank you for any help.
You can trim the square brackets and cast to float:
import pyspark.sql.functions as F
df2 = df.withColumn('salary', F.expr("float(trim('[]', salary))"))
df2.show()
+--------+-----+------+
| name|state|salary|
+--------+-----+------+
| Smith| OH| 55.5|
| Anna| NY| 33.3|
|Williams| OH| 939.3|
+--------+-----+------+
Or you can use from_json to parse it as an array of float, and get the first array element:
df2 = df.withColumn('salary', F.from_json('salary', 'array<float>')[0])
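A quick sanity check of that route against the sample dataframe built above (a sketch):
import pyspark.sql.functions as F

# parse "[55.5]" as array<float>, then take the first element
df2 = df.withColumn('salary', F.from_json('salary', 'array<float>')[0])
df2.printSchema()  # salary should now be float
df2.show()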
You can use regex:
import pyspark.sql.functions as F
df.select(
    F.regexp_extract('salary', '([\d\.]+)', 1).cast('float').alias('salary')
).show()
Output:
+------+
|salary|
+------+
| 55.5|
| 33.3|
| 939.3|
+------+
You need to parse the string into a float array using a UDF, and then you can explode the array to get the single value inside it.
The program would be as follows:
import json
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType

def parse_value_from_string(x):
    res = json.loads(x)
    return res

parse_float_array = F.udf(parse_value_from_string, ArrayType(FloatType()))
df = df.withColumn('float_value_salary', F.explode(parse_float_array(F.col('salary'))))
df_output = df.select('name', 'state', 'float_value_salary')
The output dataframe would look like the following:
+--------+-----+------------------+
| name|state|float_value_salary|
+--------+-----+------------------+
| Smith| OH| 55.5|
| Anna| NY| 33.3|
|Williams| OH| 939.3|
+--------+-----+------------------+

pyspark dataframe change column with two arrays into columns

I've been searching around and haven't figured out a way to restructure a dataframe's column to dynamically add new columns to the dataframe based on the array contents. I'm new to python, so I might be searching on the wrong terms, which may be why I haven't found a clear example yet. Please let me know if this is a duplicate and link to it. I think I just need to be pointed in the right direction.
Ok, the details.
The environment is pyspark 2.3.2 and python 2.7.
The sample column contains 2 arrays, which are correlated to each other 1-to-1. I would like to create a column for each value in the titles array and put the corresponding name (from the person array) in that column.
I cobbled together an example to focus on my problem with changing the dataframe.
import json
from pyspark.sql.types import ArrayType, StructType, StructField, StringType
from pyspark.sql import functions as f
input = { "sample": { "titles": ["Engineer", "Designer", "Manager"], "person": ["Mary", "Charlie", "Mac"] }, "location": "loc a"},{ "sample": { "titles": ["Engineer", "Owner"],
"person": ["Tom", "Sue"] }, "location": "loc b"},{ "sample": { "titles": ["Engineer", "Designer"], "person": ["Jane", "Bill"] }, "location": "loc a"}
a = [json.dumps(input)]
jsonRDD = sc.parallelize(a)
df = spark.read.json(jsonRDD)
This is the schema of my dataframe:
In [4]: df.printSchema()
root
|-- location: string (nullable = true)
|-- sample: struct (nullable = true)
| |-- person: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- titles: array (nullable = true)
| | |-- element: string (containsNull = true)
My dataframe data:
In [5]: df.show(truncate=False)
+--------+-----------------------------------------------------+
|location|sample |
+--------+-----------------------------------------------------+
|loc a |[[Mary, Charlie, Mac], [Engineer, Designer, Manager]]|
|loc b |[[Sue, Tom], [Owner, Engineer]] |
|loc a |[[Jane, Bill], [Engineer, Designer]] |
+--------+-----------------------------------------------------+
And what I would like my dataframe to look like:
+--------+-----------------------------------------------------+------------+-----------+---------+---------+
|location|sample                                               |Engineer    |Designer   |Manager  | Owner   |
+--------+-----------------------------------------------------+------------+-----------+---------+---------+
|loc a |[[Mary, Charlie, Mac], [Engineer, Designer, Manager]]|Mary |Charlie |Mac | |
|loc b |[[Sue, Tom], [Owner, Engineer]] |Tom | | |Sue |
|loc a |[[Jane, Bill], [Engineer, Designer]] |Jane |Bill | | |
+--------+-----------------------------------------------------+------------+-----------+---------+---------+
I've tried to use the explode function, only to end up with more records with the array field in each record. There have been some examples in stackoverflow, but they have static column names. This dataset can have them in any order and new titles can be added later.
Without explode
First convert each struct to a map:
from pyspark.sql.functions import udf

@udf("map<string,string>")
def as_dict(x):
    return dict(zip(*x)) if x else None

dfmap = df.withColumn("sample", as_dict("sample"))
Then use the method shown in PySpark converting a column of type 'map' to multiple columns in a dataframe to split the map into columns.
With explode
Add a unique id using monotonically_increasing_id.
Use one of the methods shown in Pyspark: Split multiple array columns into rows to explode both arrays together, or explode the map created with the first method.
Pivot the result: group by the added id and any other fields you want to preserve, pivot by title, and take first(person). A rough sketch of this route is shown below.
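The following is a rough, untested sketch of the explode-and-pivot route against the sample dataframe from the question (the id, pos, title and person column names are introduced here only for illustration):
from pyspark.sql import functions as f

# explode titles together with their positions, look up the matching person,
# then pivot the titles into columns
exploded = (df
    .withColumn("id", f.monotonically_increasing_id())
    .select("id", "location", "sample",
            f.posexplode("sample.titles").alias("pos", "title"))
    .withColumn("person", f.expr("sample.person[pos]")))

result = (exploded
    .groupBy("id", "location", "sample")
    .pivot("title")
    .agg(f.first("person")))

result.show(truncate=False)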
@user10601094 helped me get this question answered. I'm posting the full solution below to help anyone else who might have a similar question.
I'm not very fluent in python, so please feel free to suggest better approaches.
In [1]: import json
...: from pyspark.sql import functions as f
...:
In [2]: # define a sample data set
...: input = { "sample": { "titles": ["Engineer", "Designer", "Manager"], "person": ["Mary", "Charlie", "Mac"] }, "location": "loc a"},{ "sample": { "titles": ["Engineer", "Owner"],
...: "person": ["Tom", "Sue"] }, "location": "loc b"},{ "sample": { "titles": ["Engineer", "Designer"], "person": ["Jane", "Bill"] }, "location": "loc a"}
In [3]: # create a dataframe with the sample json data
...: a = [json.dumps(input)]
...: jsonRDD = sc.parallelize(a)
...: df = spark.read.json(jsonRDD)
...:
2018-11-03 20:48:09 WARN ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException
In [4]: # Change the array in the sample column to a dictionary
...: # swap the columns so the titles are the key
...:
...: # UDF to convert 2 arrays into a map
...: @f.udf("map<string,string>")
...: def as_dict(x):
...:     return dict(zip(x[1],x[0])) if x else None
...:
In [5]: # create a new dataframe based on the original dataframe
...: dfmap = df.withColumn("sample", as_dict("sample"))
In [6]: # Convert sample column to be title columns based on the map
...:
...: # get the columns names, stored in the keys
...: keys = (dfmap
...: .select(f.explode("sample"))
...: .select("key")
...: .distinct()
...: .rdd.flatMap(lambda x: x)
...: .collect())
In [7]: # create a list of column names
...: exprs = [f.col("sample").getItem(k).alias(k) for k in keys]
...:
In [8]: dfmap.select(dfmap.location, *exprs).show()
+--------+--------+--------+-------+-----+
|location|Designer|Engineer|Manager|Owner|
+--------+--------+--------+-------+-----+
| loc a| Charlie| Mary| Mac| null|
| loc b| null| Tom| null| Sue|
| loc a| Bill| Jane| null| null|
+--------+--------+--------+-------+-----+

unzip list of tuples in pyspark dataframe

I want to unzip a list of tuples in a column of a pyspark dataframe.
Say a column contains [('blue', 0.5), ('red', 0.1), ('green', 0.7)]; I want to split it into two columns, with the first column holding [blue, red, green] and the second column holding [0.5, 0.1, 0.7].
+-----+-------------------------------------------+
|Topic| Tokens |
+-----+-------------------------------------------+
| 1| ('blue', 0.5),('red', 0.1),('green', 0.7)|
| 2| ('red', 0.9),('cyan', 0.5),('white', 0.4)|
+-----+-------------------------------------------+
which can be created with this code:
df = sqlCtx.createDataFrame(
    [
        (1, ('blue', 0.5), ('red', 0.1), ('green', 0.7)),
        (2, ('red', 0.9), ('cyan', 0.5), ('white', 0.4))
    ],
    ('Topic', 'Tokens')
)
And, the output should look like:
+-----+--------------------------+-----------------+
|Topic| Tokens | Weights |
+-----+--------------------------+-----------------+
| 1| ['blue', 'red', 'green']| [0.5, 0.1, 0.7] |
| 2| ['red', 'cyan', 'white']| [0.9, 0.5, 0.4] |
+-----+--------------------------+-----------------+
If the schema of your DataFrame looks like this:
root
|-- Topic: long (nullable = true)
|-- Tokens: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- _1: string (nullable = true)
| | |-- _2: double (nullable = true)
then you can select:
from pyspark.sql.functions import col

df.select(
    col("Topic"),
    col("Tokens._1").alias("Tokens"),
    col("Tokens._2").alias("weights")
).show()
# +-----+------------------+---------------+
# |Topic| Tokens| weights|
# +-----+------------------+---------------+
# | 1|[blue, red, green]|[0.5, 0.1, 0.7]|
# | 2|[red, cyan, white]|[0.9, 0.5, 0.4]|
# +-----+------------------+---------------+
And generalized:
cols = [
col("Tokens.{}".format(n)) for n in
df.schema["Tokens"].dataType.elementType.names]
df.select("Topic", *cols)
Reference Querying Spark SQL DataFrame with complex types
You can achieve this with simple indexing using udf():
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType, FloatType

# create the dataframe
df = sqlCtx.createDataFrame(
    [
        (1, [('blue', 0.5), ('red', 0.1), ('green', 0.7)]),
        (2, [('red', 0.9), ('cyan', 0.5), ('white', 0.4)])
    ],
    ('Topic', 'Tokens')
)

def get_colors(l):
    return [x[0] for x in l]

def get_weights(l):
    return [x[1] for x in l]

# make udfs from the above functions - note the return types
get_colors_udf = udf(get_colors, ArrayType(StringType()))
get_weights_udf = udf(get_weights, ArrayType(FloatType()))

# use withColumn and apply the udfs
df.withColumn('Weights', get_weights_udf(col('Tokens')))\
    .withColumn('Tokens', get_colors_udf(col('Tokens')))\
    .select(['Topic', 'Tokens', 'Weights'])\
    .show()
Output:
+-----+------------------+---------------+
|Topic| Tokens| Weights|
+-----+------------------+---------------+
| 1|[blue, red, green]|[0.5, 0.1, 0.7]|
| 2|[red, cyan, white]|[0.9, 0.5, 0.4]|
+-----+------------------+---------------+

ValueError: Found array with dim 3. Estimator expected <= 2

I'm trying to generate my own training data for a recognition problem. I have two folders, s0 and s1, inside a folder named data.
images and lables are two lists, where lables contains the names of the folders (used as labels).
|-- data
|   |-- s0
|   |   |-- 1.pgm
|   |   |-- 2.pgm
|   |   |-- 3.pgm
|   |   |-- 4.pgm
|   |   |-- ...
|   |-- s1
|   |   |-- 1.pgm
|   |   |-- 2.pgm
|   |   |-- 3.pgm
|   |   |-- 4.pgm
|   |   |-- ...
Below is the code; it shows an error on the line classifier.fit(images, lables):
Traceback (most recent call last):
File "mint.py", line 34, in <module>
classifier.fit(images, lables)
File "/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.py", line 150, in fit
X = check_array(X, accept_sparse='csr', dtype=np.float64, order='C')
File "/usr/local/lib/python2.7/dist- packages/sklearn/utils/validation.py", line 396, in check_array
% (array.ndim, estimator_name))
ValueError: Found array with dim 3. Estimator expected <= 2.
Here is the code:
import os, sys
import cv2
import numpy as np
from sklearn.svm import SVC

fn_dir = '/home/aquib/Desktop/Natural/data'

# Create a list of images and a list of corresponding names
(images, lables, names, id) = ([], [], {}, 0)

for (subdirs, dirs, files) in os.walk(fn_dir):
    for subdir in dirs:
        names[id] = subdir
        mypath = os.path.join(fn_dir, subdir)
        for item in os.listdir(mypath):
            if '.png' in item:
                label = id
                image = cv2.imread(os.path.join(mypath, item), 0)
                r_image = np.resize(image, (30, 30))
                if image is not None:
                    images.append(r_image)
                    lables.append(int(label))
        id += 1

# Create a Numpy array from the two lists above
(images, lables) = [np.array(lis) for lis in [images, lables]]

classifier = SVC(verbose=0, kernel='poly', degree=3)
classifier.fit(images, lables)
I really don't understand how to correct this to 2 dimensions.
I have tried the code below, but the error is the same:
images = np.array(images)
im_sq = np.squeeze(images).shape
images = images.reshape(images.shape[:2])
There is a syntax error on images.append(cv2.imread((path, 0)), the last line in your code: the parentheses are not closed properly, so it should be images.append(cv2.imread((path, 0))). Also, it is always a good idea to post the traceback for the error, so that it is easy for anyone to answer.
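As for the ValueError itself: scikit-learn estimators expect a 2-D array of shape (n_samples, n_features), while the stacked images have shape (n_samples, 30, 30). A minimal sketch of flattening them before fitting (assuming the images and lables lists built above):
import numpy as np
from sklearn.svm import SVC

# flatten each 30x30 image into a 900-feature row vector
X = np.array(images).reshape(len(images), -1)   # shape (n_samples, 900)
y = np.array(lables)

classifier = SVC(verbose=0, kernel='poly', degree=3)
classifier.fit(X, y)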

Cannot update table , when comparing data between two cursors

I wanted to compare rows of a table to find out whether they are equal or not,
so I created 2 cursors:
1. Select links from table where visited = yes
2. Select links from table where visited = no
Using a for loop and an if statement, I want to compare the visited links with the not-visited links to see whether they are equal, and if they are, update visited for that link to "YES".
I'm not done yet (my aim is to exit the program once all links are visited and marked YES, or when the cursor for "where visited = no" returns no rows).
The relevant portion of my code:
import sys
import MySQLdb
import urllib
import urlparse
import re
import HTMLParser
from HTMLParser import HTMLParseError
from bs4 import BeautifulSoup
mydb = MySQLdb.connect(host='localhost',
user='root',
passwd='shailang',
db='mydb')
cursor = mydb.cursor()
def process2(url):
    flag = 0
    cursor.execute("SELECT links FROM DATA_urls where visited = 'Ye'")
    Yes_rows = cursor.fetchall()
    cursor.execute("SELECT links FROM DATA_urls where visited = 'No'")
    No_rows = cursor.fetchall()
    for No_links in No_rows:
        print 'NOOOOOOOOOO'
        k = No_links
        print k
        for Yes_links in Yes_rows:
            print "YESSSSSSSSSSSSSS"
            k1 = Yes_links
            print k1
            if k1 == k:
                print 'EQUALS'
                cursor.execute("UPDATE DATA_urls SET visited = 'Ye' where links = %s", k)
                mydb.commit()

def process(url):
    proxies = {"http": "http://proxy4.nehu.ac.in:3128",
               "https": "https://proxy4.nehu.ac.in:3128"}
    page = urllib.urlopen(url, proxies=None)
    text = page.read()
    page.close()
    soup = BeautifulSoup(text)
    file = open('s.txt', 'w')
    cursor.execute("INSERT INTO DATA_urls(links,parent,visited) VALUES(%s,'NULL','Ye')", url)
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(url, tag['href'])
        print tag['href']
        if re.match(ur'(?i)\b((?:https?:\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))', tag['href']):
            cursor.execute("INSERT INTO DATA_urls(links,parent,visited) VALUES(%s,%s,'No')", (tag['href'], url))
            file.write('\n')
            file.write(tag['href'])
    # file.close()
    # cursor.execute("SELECT * FROM url")
    # rows = cursor.fetchall()
    mydb.commit()
    process2(1)

def main():
    if len(sys.argv) == 1:
        print 'No url !!'
        sys.exit(1)
    for url in sys.argv[1:]:
        process(url)

main()
I get no error, but nothing is updated in my database.
My table DESC:
+---------+---------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+---------+---------------+------+-----+---------+-------+
| links | varchar(1000) | YES | | NULL | |
| parent | varchar(1000) | YES | | NULL | |
| visited | varchar(2) | YES | | NULL | |
+---------+---------------+------+-----+---------+-------+
Change it to:
mydb = MySQLdb.connect(host='localhost',
                       user='root',
                       passwd='shailang',
                       db='mydb')
cursor = mydb.cursor()

def process2(url):
    flag = 0
    cursor.execute("SELECT links FROM DATA_urls where visited = 'Ye'")
    Yes_rows = cursor.fetchall()
    cursor.execute("SELECT links FROM DATA_urls where visited = 'No'")
    No_rows = cursor.fetchall()
    count_no = len(No_rows)
    count_yes = len(Yes_rows)
    for i in range(0, count_no):
        print 'NOOOOOOOOOO'
        k = No_rows[i]
        print k
        for j in range(0, count_yes):
            print "YESSSSSSSSSSSSSS"
            k1 = Yes_rows[j]
            print k1
            if k1 == k:
                print 'EQUALS'
                cursor.execute("UPDATE DATA_urls SET visited = 'Ye' where links = %s", k)
    mydb.commit()