Join 2 dataframes with a regex in the where clause (PySpark)

We have two dataframes
df = spark.createDataFrame([
    (1, 'Nick', 'Miller'),
    (2, 'Jessica', 'Day'),
    (3, 'Winston', 'Schmidt'),
], ['id', 'First_name', 'Last_name'])

df1 = spark.createDataFrame([
    (1, '^[a-lA-L]', 'type1'),
    (3, '^[m-zM-Z]', 'type2'),
], ['id', 'regex_match', 'value'])
I need to join these two dataframes so that df1.regex_match matches df.Last_name, joining df to df1 with a left join. Any suggestions on how to produce the output shown below?

You can join using an rlike condition:
import pyspark.sql.functions as F
result = df.alias('df').join(
    df1.drop('id').alias('df1'),
    F.expr('df.Last_name rlike df1.regex_match'),
    'left'
).drop('regex_match')
result.show()
+---+----------+---------+-----+
| id|First_name|Last_name|value|
+---+----------+---------+-----+
|  1|      Nick|   Miller|type2|
|  2|   Jessica|      Day|type1|
|  3|   Winston|  Schmidt|type2|
+---+----------+---------+-----+
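For reference, the same join can also be written as plain Spark SQL over temporary views; this is only an equivalent sketch of the rlike join above (the view names are arbitrary):
# equivalent Spark SQL formulation (sketch; view names are arbitrary)
df.createOrReplaceTempView('names')
df1.createOrReplaceTempView('patterns')

result = spark.sql("""
    SELECT n.id, n.First_name, n.Last_name, p.value
    FROM names n
    LEFT JOIN patterns p
      ON n.Last_name RLIKE p.regex_match
""")
result.show()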

Related

Django ORM: calculation only inside database query possible?

I have a rather simple dataset containing the following data:
id | aqi | date | state_name
1 | 17 | 2020-01-01 | California
2 | 54 | 2020-01-02 | California
3 | 37 | 2020-01-03 | California
4 | 29 | 2020-01-04 | California
What I'm trying to achieve is the average aqi (air quality index) from April 2022 minus the average aqi from April 2021, without using multiple queries. Is this even possible, or should I use two queries and compare them manually?
From my understanding, I should use a Q expression to filter the correct dates, correct?
AirQuality.objects.filter(Q(date__range=['2021-04-01', '2021-04-30']) & Q('2022-04-01', '2022-04-30'))
The best solution I've come up with so far is:
qs_apr20 = (
    AirQuality.objects
    .aggregate(apr20=Avg('aqi', filter=Q(date__range=(datetime.date(2020, 4, 1), datetime.date(2020, 4, 30)))))['apr20']
)
qs_apr21 = (
    AirQuality.objects
    .aggregate(apr21=Avg('aqi', filter=Q(date__range=(datetime.date(2021, 4, 1), datetime.date(2021, 4, 30)))))['apr21']
)
result = round(qs_apr21 - qs_apr20, 2)
Thanks for your help and have a great day!
Taking inspiration from the documentation, the following should work:
>>> import datetime
>>> from django.db.models import Avg, F, Q
>>> apr21 = Avg('aqi', filter=Q(date__range=(datetime.date(2021, 4, 1), datetime.date(2021, 4, 30))))
>>> apr22 = Avg('aqi', filter=Q(date__range=(datetime.date(2022, 4, 1), datetime.date(2022, 4, 30))))
>>> aqi_calc = (AirQuality.objects
...     .annotate(apr21=apr21)
...     .annotate(apr22=apr22)
...     .annotate(diff=F('apr22') - F('apr21')))
It should do everything in 1 query, if I'm not mistaken.
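If you prefer to get plain numbers back, a minimal single-query sketch using aggregate() with filtered Avg expressions is shown below (assuming the model is named AirQuality as in the question); both monthly averages come back from one call and the difference is computed in Python:
import datetime
from django.db.models import Avg, Q

# one database query: both monthly averages as filtered aggregates
totals = AirQuality.objects.aggregate(
    apr21=Avg('aqi', filter=Q(date__range=(datetime.date(2021, 4, 1), datetime.date(2021, 4, 30)))),
    apr22=Avg('aqi', filter=Q(date__range=(datetime.date(2022, 4, 1), datetime.date(2022, 4, 30)))),
)
diff = round(totals['apr22'] - totals['apr21'], 2)  # assumes both months have data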

Join list column with string column in PySpark

I have two data frames like df_emp and df_dept:
df_emp:
id Name
1 aaa
2 bbb
3 ccc
4 ddd
df_dept:
dept_id dept_name employees
1 DE [1, 2]
2 DA [3, 4]
The expected result after joining:
dept_name employees employee_names
DE [1, 2] [aaa, bbb]
DA [3, 4] [ccc, ddd]
Any idea how to do it using simple joins or UDFs?
It can be done without a UDF: first explode the array, then join and group.
Input data:
from pyspark.sql import functions as F
df_emp = spark.createDataFrame(
    [(1, 'aaa'),
     (2, 'bbb'),
     (3, 'ccc'),
     (4, 'ddd')],
    ['id', 'Name']
)
df_dept = spark.createDataFrame(
    [(1, 'DE', [1, 2]),
     (2, 'DA', [3, 4])],
    ['dept_id', 'dept_name', 'employees']
)
Script:
df_dept_exploded = df_dept.withColumn('id', F.explode('employees'))
df_joined = df_dept_exploded.join(df_emp, 'id', 'left')
df = (
    df_joined
    .groupBy('dept_name')
    .agg(
        F.collect_list('id').alias('employees'),
        F.collect_list('Name').alias('employee_names')
    )
)
df.show()
# +---------+---------+--------------+
# |dept_name|employees|employee_names|
# +---------+---------+--------------+
# |       DE|   [1, 2]|    [aaa, bbb]|
# |       DA|   [3, 4]|    [ccc, ddd]|
# +---------+---------+--------------+
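An alternative sketch that avoids the explode/groupBy round trip, assuming Spark 2.4+ (map_from_arrays and the transform higher-order function): it builds a single-row id-to-name lookup map and applies it to the employees array, which also preserves the original array order (collect_list after a groupBy does not guarantee it). The emp_map and df_alt names are mine.
emp_map = df_emp.agg(
    F.map_from_arrays(F.collect_list('id'), F.collect_list('Name')).alias('emp_map')
)

df_alt = (
    df_dept
    .crossJoin(emp_map)   # emp_map is a single-row dataframe
    .withColumn('employee_names', F.expr('transform(employees, x -> emp_map[x])'))
    .drop('emp_map', 'dept_id')
)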

How to extract values from a column and have it as float in pyspark?

I have a PySpark dataframe that visually looks like the following. I want the column to hold float values only. Please note that currently the values have square brackets around them.
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType

data = [
    ("Smith", "OH", "[55.5]"),
    ("Anna", "NY", "[33.3]"),
    ("Williams", "OH", "[939.3]"),
]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('state', StringType(), True),
    StructField('salary', StringType(), True)
])
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
Input:
+--------+-----+-------+
|name    |state|salary |
+--------+-----+-------+
|Smith   |OH   |[55.5] |
|Anna    |NY   |[33.3] |
|Williams|OH   |[939.3]|
+--------+-----+-------+
And the output should look like,
+--------+-----+------------------+
|name    |state|float_value_salary|
+--------+-----+------------------+
|Smith   |OH   |55.5              |
|Anna    |NY   |33.3              |
|Williams|OH   |939.3             |
+--------+-----+------------------+
Thank you for any help.
You can trim the square brackets and cast to float:
import pyspark.sql.functions as F
df2 = df.withColumn('salary', F.expr("float(trim('[]', salary))"))
df2.show()
+--------+-----+------+
|    name|state|salary|
+--------+-----+------+
|   Smith|   OH|  55.5|
|    Anna|   NY|  33.3|
|Williams|   OH| 939.3|
+--------+-----+------+
Or you can use from_json to parse it as an array of float, and get the first array element:
df2 = df.withColumn('salary', F.from_json('salary', 'array<float>')[0])
You can use regex:
import pyspark.sql.functions as F

df.select(
    F.regexp_extract('salary', r'([\d\.]+)', 1).cast('float').alias('salary')
).show()
Output:
+------+
|salary|
+------+
|  55.5|
|  33.3|
| 939.3|
+------+
You need to parse the string to a float array using a UDF, and then you can explode the array to get the single value within it.
The program would be as follows:
import json
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType

def parse_value_from_string(x):
    res = json.loads(x)
    return res

parse_float_array = F.udf(parse_value_from_string, ArrayType(FloatType()))

df = df.withColumn('float_value_salary', F.explode(parse_float_array(F.col('salary'))))
df_output = df.select('name', 'state', 'float_value_salary')
The output dataframe would look like the following:
+--------+-----+------------------+
|    name|state|float_value_salary|
+--------+-----+------------------+
|   Smith|   OH|              55.5|
|    Anna|   NY|              33.3|
|Williams|   OH|             939.3|
+--------+-----+------------------+

pyspark dataframe change column with two arrays into columns

I've been searching around and haven't figured out a way to restructure a dataframe's column to add new columns to the dataframe dynamically, based on the array contents. I'm new to Python, so I might be searching on the wrong terms, which may be why I haven't found a clear example yet. Please let me know if this is a duplicate and link to it; I think I just need to be pointed in the right direction.
Ok, the details.
The environment is pyspark 2.3.2 and python 2.7
The sample column contains two arrays, which are correlated to each other one to one. I would like to create a column for each value in the titles array and put the corresponding name (from the person array) in the respective column.
I cobbled up an example to focus on my problem with changing the dataframe.
import json
from pyspark.sql.types import ArrayType, StructType, StructField, StringType
from pyspark.sql import functions as f
input = (
    {"sample": {"titles": ["Engineer", "Designer", "Manager"], "person": ["Mary", "Charlie", "Mac"]}, "location": "loc a"},
    {"sample": {"titles": ["Engineer", "Owner"], "person": ["Tom", "Sue"]}, "location": "loc b"},
    {"sample": {"titles": ["Engineer", "Designer"], "person": ["Jane", "Bill"]}, "location": "loc a"},
)
a = [json.dumps(input)]
jsonRDD = sc.parallelize(a)
df = spark.read.json(jsonRDD)
This is the schema of my dataframe:
In [4]: df.printSchema()
root
 |-- location: string (nullable = true)
 |-- sample: struct (nullable = true)
 |    |-- person: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- titles: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
My dataframe data:
In [5]: df.show(truncate=False)
+--------+-----------------------------------------------------+
|location|sample |
+--------+-----------------------------------------------------+
|loc a |[[Mary, Charlie, Mac], [Engineer, Designer, Manager]]|
|loc b |[[Sue, Tom], [Owner, Engineer]] |
|loc a |[[Jane, Bill], [Engineer, Designer]] |
+--------+-----------------------------------------------------+
And what I would like my dataframe to look like:
+--------+-----------------------------------------------------+------------+-----------+---------+---------+
|location|sample |Engineer |Designer |Manager | Owner |
+--------+-----------------------------------------------------+------------+-----------+---------+---------+
|loc a |[[Mary, Charlie, Mac], [Engineer, Designer, Manager]]|Mary |Charlie |Mac | |
|loc b |[[Sue, Tom], [Owner, Engineer]] |Tom | | |Sue |
|loc a |[[Jane, Bill], [Engineer, Designer]] |Jane |Bill | | |
+--------+-----------------------------------------------------+------------+-----------+---------+---------+
I've tried to use the explode function, only to end up with more records with the array field in each record. There have been some examples on Stack Overflow, but they have static column names. This dataset can have the titles in any order, and new titles can be added later.
Without explode
First convert each struct to a map:
from pyspark.sql.functions import udf

@udf("map<string,string>")
def as_dict(x):
    return dict(zip(*x)) if x else None

dfmap = df.withColumn("sample", as_dict("sample"))
Then use the method shown in PySpark converting a column of type 'map' to multiple columns in a dataframe to split the map into columns.
With explode
Add a unique id using monotonically_increasing_id.
Use one of the methods shown in Pyspark: Split multiple array columns into rows to explode both arrays together, or explode the map created with the first method.
Pivot the result: group by the added id and any other fields you want to preserve, pivot by title, and take first(person). A sketch of this route is shown below.
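For illustration, a minimal sketch of the explode route, assuming Spark 2.4+ for arrays_zip (on the question's 2.3.2 you would need the posexplode approach from the linked question); the row_id, title and person_name names are mine:
from pyspark.sql import functions as F

df_id = df.withColumn('row_id', F.monotonically_increasing_id())

# explode both arrays together by zipping them element-wise
exploded = (
    df_id
    .withColumn('titles', F.col('sample.titles'))
    .withColumn('person', F.col('sample.person'))
    .withColumn('pair', F.explode(F.arrays_zip('titles', 'person')))
    .select('row_id', 'location', 'sample',
            F.col('pair.titles').alias('title'),
            F.col('pair.person').alias('person_name'))
)

# pivot by title, keeping one person per (row, title)
result = (
    exploded
    .groupBy('row_id', 'location', 'sample')
    .pivot('title')
    .agg(F.first('person_name'))
    .drop('row_id')
)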
@user10601094 helped me get this question answered. I'm posting the full solution below to help anyone else who might have a similar question.
I'm not very fluent in Python, so please feel free to suggest better approaches.
In [1]: import json
...: from pyspark.sql import functions as f
...:
In [2]: # define a sample data set
...: input = { "sample": { "titles": ["Engineer", "Designer", "Manager"], "person": ["Mary", "Charlie", "Mac"] }, "location": "loc a"},{ "sample": { "titles": ["Engineer", "Owner"],
...: "person": ["Tom", "Sue"] }, "location": "loc b"},{ "sample": { "titles": ["Engineer", "Designer"], "person": ["Jane", "Bill"] }, "location": "loc a"}
In [3]: # create a dataframe with the sample json data
...: a = [json.dumps(input)]
...: jsonRDD = sc.parallelize(a)
...: df = spark.read.json(jsonRDD)
...:
In [4]: # Change the array in the sample column to a dictionary
...: # swap the columns so the titles are the key
...:
...: # UDF to convert 2 arrays into a map
...: @f.udf("map<string,string>")
...: def as_dict(x):
...:     return dict(zip(x[1], x[0])) if x else None
...:
In [5]: # create a new dataframe based on the original dataframe
...: dfmap = df.withColumn("sample", as_dict("sample"))
In [6]: # Convert sample column to be title columns based on the map
...:
...: # get the columns names, stored in the keys
...: keys = (dfmap
...:     .select(f.explode("sample"))
...:     .select("key")
...:     .distinct()
...:     .rdd.flatMap(lambda x: x)
...:     .collect())
In [7]: # create a list of column names
...: exprs = [f.col("sample").getItem(k).alias(k) for k in keys]
...:
In [8]: dfmap.select(dfmap.location, *exprs).show()
+--------+--------+--------+-------+-----+
|location|Designer|Engineer|Manager|Owner|
+--------+--------+--------+-------+-----+
|   loc a| Charlie|    Mary|    Mac| null|
|   loc b|    null|     Tom|   null|  Sue|
|   loc a|    Bill|    Jane|   null| null|
+--------+--------+--------+-------+-----+

unzip list of tuples in pyspark dataframe

I want to unzip a list of tuples in a column of a PySpark dataframe.
Let's say a column contains [(blue, 0.5), (red, 0.1), (green, 0.7)]; I want to split it into two columns, with the first column as [blue, red, green] and the second column as [0.5, 0.1, 0.7].
+-----+-------------------------------------------+
|Topic| Tokens |
+-----+-------------------------------------------+
| 1| ('blue', 0.5),('red', 0.1),('green', 0.7)|
| 2| ('red', 0.9),('cyan', 0.5),('white', 0.4)|
+-----+-------------------------------------------+
which can be created with this code:
df = sqlCtx.createDataFrame(
    [
        (1, [('blue', 0.5), ('red', 0.1), ('green', 0.7)]),
        (2, [('red', 0.9), ('cyan', 0.5), ('white', 0.4)])
    ],
    ('Topic', 'Tokens')
)
And, the output should look like:
+-----+--------------------------+-----------------+
|Topic| Tokens                   | Weights         |
+-----+--------------------------+-----------------+
| 1   | ['blue', 'red', 'green'] | [0.5, 0.1, 0.7] |
| 2   | ['red', 'cyan', 'white'] | [0.9, 0.5, 0.4] |
+-----+--------------------------+-----------------+
If schema of your DataFrame looks like this:
root
 |-- Topic: long (nullable = true)
 |-- Tokens: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _1: string (nullable = true)
 |    |    |-- _2: double (nullable = true)
then you can select:
from pyspark.sql.functions import col
df.select(
    col("Topic"),
    col("Tokens._1").alias("Tokens"),
    col("Tokens._2").alias("weights")
).show()
# +-----+------------------+---------------+
# |Topic|            Tokens|        weights|
# +-----+------------------+---------------+
# |    1|[blue, red, green]|[0.5, 0.1, 0.7]|
# |    2|[red, cyan, white]|[0.9, 0.5, 0.4]|
# +-----+------------------+---------------+
And generalized:
cols = [
    col("Tokens.{}".format(n))
    for n in df.schema["Tokens"].dataType.elementType.names
]
df.select("Topic", *cols)
Reference: Querying Spark SQL DataFrame with complex types
You can achieve this with simple indexing using udf():
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType, FloatType

# create the dataframe
df = sqlCtx.createDataFrame(
    [
        (1, [('blue', 0.5), ('red', 0.1), ('green', 0.7)]),
        (2, [('red', 0.9), ('cyan', 0.5), ('white', 0.4)])
    ],
    ('Topic', 'Tokens')
)

def get_colors(l):
    return [x[0] for x in l]

def get_weights(l):
    return [x[1] for x in l]

# make udfs from the above functions - note the return types
get_colors_udf = udf(get_colors, ArrayType(StringType()))
get_weights_udf = udf(get_weights, ArrayType(FloatType()))

# use withColumn and apply the udfs
df.withColumn('Weights', get_weights_udf(col('Tokens')))\
    .withColumn('Tokens', get_colors_udf(col('Tokens')))\
    .select(['Topic', 'Tokens', 'Weights'])\
    .show()
Output:
+-----+------------------+---------------+
|Topic|            Tokens|        Weights|
+-----+------------------+---------------+
|    1|[blue, red, green]|[0.5, 0.1, 0.7]|
|    2|[red, cyan, white]|[0.9, 0.5, 0.4]|
+-----+------------------+---------------+