I have an Athena table partitioned by date, with partition values like this:
20190218
I want to delete all the partitions that were created last year.
I tried the queries below, but they didn't work:
ALTER TABLE tblname DROP PARTITION (partition1 < '20181231');
ALTER TABLE tblname DROP PARTITION (partition1 > '20181010'), Partition (partition1 < '20181231');
According to https://docs.aws.amazon.com/athena/latest/ug/alter-table-drop-partition.html, ALTER TABLE tblname DROP PARTITION takes a partition spec, so no ranges are allowed.
In Presto you would do DELETE FROM tblname WHERE ..., but DELETE is not supported by Athena either.
For these reasons, you need to leverage some external solution. For example (a rough boto3 sketch follows after these steps):
list the files as in https://stackoverflow.com/a/48824373/65458
delete the files and their containing directories
update the partition information (https://docs.aws.amazon.com/athena/latest/ug/msck-repair-table.html should be helpful)
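Here is that rough sketch with boto3. It assumes, hypothetically, that the data lives under Hive-style prefixes like s3://my-bucket/tblname/partition1=YYYYMMDD/; the bucket, prefix, database name, and output location are all placeholders:
import boto3

s3 = boto3.client('s3')
athena = boto3.client('athena')

# Hypothetical layout: s3://my-bucket/tblname/partition1=YYYYMMDD/...
bucket = 'my-bucket'
prefix = 'tblname/'

# 1. List the objects that belong to last year's partitions.
paginator = s3.get_paginator('list_objects_v2')
to_delete = []
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    for obj in page.get('Contents', []):
        if 'partition1=2018' in obj['Key']:
            to_delete.append({'Key': obj['Key']})

# 2. Delete them (delete_objects accepts at most 1000 keys per call).
for i in range(0, len(to_delete), 1000):
    s3.delete_objects(Bucket=bucket, Delete={'Objects': to_delete[i:i + 1000]})

# 3. Refresh the partition metadata.
athena.start_query_execution(
    QueryString='MSCK REPAIR TABLE tblname',
    QueryExecutionContext={'Database': 'database_name'},
    ResultConfiguration={'OutputLocation': 's3://my-bucket/athena-results/'},
)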
While the Athena SQL may not support it at this time, the Glue API call GetPartitions (that Athena uses under the hood for queries) supports complex filter expressions similar to what you can write in a SQL WHERE expression.
Instead of deleting partitions through Athena you can do GetPartitions followed by BatchDeletePartition using the Glue API.
This is a script that does what Theo recommended:
import json
import logging

import awswrangler as wr
import boto3
from botocore.exceptions import ClientError

logging.basicConfig(level=logging.INFO, format=logging.BASIC_FORMAT)
logger = logging.getLogger()


def delete_partitions(database_name: str, table_name: str):
    client = boto3.client('glue')
    paginator = client.get_paginator('get_partitions')
    page_count = 0
    partition_count = 0
    for page in paginator.paginate(DatabaseName=database_name, TableName=table_name, MaxResults=20):
        page_count = page_count + 1
        partitions = page['Partitions']
        partitions_to_delete = []
        for partition in partitions:
            partition_count = partition_count + 1
            partitions_to_delete.append({'Values': partition['Values']})
            logger.info(f"Found partition {partition['Values']}")
        if partitions_to_delete:
            response = client.batch_delete_partition(DatabaseName=database_name, TableName=table_name,
                                                     PartitionsToDelete=partitions_to_delete)
            logger.info(f'Deleted partitions with response: {response}')
        else:
            logger.info('Done with all partitions')


def repair_table(database_name: str, table_name: str):
    client = boto3.client('athena')
    try:
        response = client.start_query_execution(QueryString='MSCK REPAIR TABLE ' + table_name + ';',
                                                QueryExecutionContext={'Database': database_name})
    except ClientError as err:
        logger.info(err.response['Error']['Message'])
    else:
        res = wr.athena.wait_query(query_execution_id=response['QueryExecutionId'])
        logger.info(f"Query succeeded: {json.dumps(res, indent=2)}")


if __name__ == '__main__':
    table = 'table_name'
    database = 'database_name'
    delete_partitions(database_name=database, table_name=table)
    repair_table(database_name=database, table_name=table)
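Note that, as written, the paginator call above returns (and therefore deletes) every partition in the table. GetPartitions also accepts an Expression filter, so a variant restricted to last year's partitions could look like the sketch below; the partition key name partition1 is an assumption taken from the question:
import boto3


def find_partitions_to_delete(database_name: str, table_name: str, expression: str):
    """Collect partition value lists matching a Glue filter expression."""
    client = boto3.client('glue')
    paginator = client.get_paginator('get_partitions')
    to_delete = []
    # The expression is evaluated by the Glue catalog, so only matching partitions come back.
    for page in paginator.paginate(DatabaseName=database_name,
                                   TableName=table_name,
                                   Expression=expression):
        to_delete.extend({'Values': p['Values']} for p in page['Partitions'])
    return to_delete


# Hypothetical usage, assuming the partition key is named partition1:
# find_partitions_to_delete('database_name', 'table_name',
#                           "partition1 >= '20180101' AND partition1 <= '20181231'")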
Posting the Glue API workaround for Java to save some time for those who need it:
public void deleteMetadataTablePartition(String catalog,
                                         String db,
                                         String table,
                                         String expression) {
    GetPartitionsRequest getPartitionsRequest = new GetPartitionsRequest()
            .withCatalogId(catalog)
            .withDatabaseName(db)
            .withTableName(table)
            .withExpression(expression);
    List<PartitionValueList> partitionsToDelete = new ArrayList<>();
    do {
        GetPartitionsResult getPartitionsResult = this.glue.getPartitions(getPartitionsRequest);
        List<PartitionValueList> partitionsValues = getPartitionsResult.getPartitions()
                .parallelStream()
                .map(p -> new PartitionValueList().withValues(p.getValues()))
                .collect(Collectors.toList());
        partitionsToDelete.addAll(partitionsValues);
        getPartitionsRequest.setNextToken(getPartitionsResult.getNextToken());
    } while (getPartitionsRequest.getNextToken() != null);
    Lists.partition(partitionsToDelete, 25)
            .parallelStream()
            .forEach(partitionValueList -> {
                glue.batchDeletePartition(
                        new BatchDeletePartitionRequest()
                                .withCatalogId(catalog)
                                .withDatabaseName(db)
                                .withTableName(table)
                                .withPartitionsToDelete(partitionValueList));
            });
}
I am trying to perform a batch write item for a dynamodb table using boto3 python library. The table has both hash and range key. When I performed the same with another table with only hash key it worked well. I am wondering how to add both hash and range key when performing batch write item operation.
import boto3
from boto3.dynamodb.conditions import Attr, Key

dynamodb = boto3.resource('dynamodb', 'us-east-2')
table = dynamodb.Table('edc_test')

scan = table.scan(
    #ProjectionExpression='#k',
    ProjectionExpression='resource_id',
    #ProjectionExpression='version_id',
    FilterExpression=Attr('Health.New version - Veracity unavailable').eq("A new dataset is available but IDQ rules are not generated yet")
)
items = scan['Items']
print('length', str(len(items)))
print(items)


def lambda_handler(event, context):
    with table.batch_writer() as batch:
        for each in scan['Items']:
            batch.delete_item(Key=each)


scan = table.scan(
    ProjectionExpression='version_id,resource_id',
    FilterExpression=Attr('Health.New version - Veracity unavailable').eq("A new dataset is available but IDQ rules are not generated yet")
    #ExpressionAttributeNames={
    #    '#k': 'name'
    #}
)
items = scan['Items']
print('length', str(len(items)))
print(items)
#response = table.table.delete_item(Key={resource_id:1})
with table.batch_writer() as batch:
    #for each in scan['Items']:
    #    batch.delete_item(Key=each)
    for each in scan['Items']:
        #batch.delete_item(Key={'version_id': each['version_id']})
        # The delete key must contain both the hash key and the range key.
        batch.delete_item(Key={'resource_id': each['resource_id'], 'version_id': each['version_id']})
Including the sort key in the scan projection expression, and including it in the key for the batch delete item, made it work.
I have a self-authored Glue script and a JDBC connection stored in the Glue catalog. I cannot figure out how to use PySpark to run a select statement against the MySQL database in RDS that my JDBC connection points to. I have also used a Glue crawler to infer the schema of the RDS table that I am interested in querying. How do I query the RDS database using a WHERE clause?
I have looked through the documentation for DynamicFrameReader and the GlueContext Class but neither seem to point me in the direction that I am seeking.
It depends on what you want to do. For example, if you want to do a select * from table where <conditions>, there are two options:
Assuming you created a crawler and inserted the source on your AWS Glue job like this:
# Read data from database
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "db", table_name = "students", redshift_tmp_dir = args["TempDir"])
AWS Glue
# Select the needed fields
selectfields1 = SelectFields.apply(frame = datasource0, paths = ["user_id", "full_name", "is_active", "org_id", "org_name", "institution_id", "department_id"], transformation_ctx = "selectfields1")
filter2 = Filter.apply(frame = selectfields1, f = lambda x: x["org_id"] in org_ids, transformation_ctx="filter2")
PySpark + AWS Glue
# Change DynamicFrame to Spark DataFrame
dataframe = DynamicFrame.toDF(datasource0)
# Create a view
dataframe.createOrReplaceTempView("students")
# Use SparkSQL to select the fields
dataframe_sql_df_dim = spark.sql("SELECT user_id, full_name, is_active, org_id, org_name, institution_id, department_id FROM students WHERE org_id in (" + org_ids + ")")
# Change back to DynamicFrame
selectfields = DynamicFrame.fromDF(dataframe_sql_df_dim, glueContext, "selectfields2")
I understand that there is no UPSERT query one can run directly from Glue to Redshift. Is it possible to implement the staging table concept within the Glue script itself?
So my expectation is to create the staging table, merge it with the destination table, and finally delete it. Can this be achieved within the Glue script?
It is possible to implement an upsert into Redshift using a staging table in Glue by passing the 'postactions' option to the JDBC sink:
val destinationTable = "upsert_test"
val destination = s"dev_sandbox.${destinationTable}"
val staging = s"dev_sandbox.${destinationTable}_staging"

val fields = datasetDf.toDF().columns.mkString(",")

val postActions =
  s"""
     DELETE FROM $destination USING $staging AS S
       WHERE $destinationTable.id = S.id
         AND $destinationTable.date = S.date;
     INSERT INTO $destination ($fields) SELECT $fields FROM $staging;
     DROP TABLE IF EXISTS $staging
   """

// Write data to staging table in Redshift
glueContext.getJDBCSink(
  catalogConnection = "redshift-glue-connections-test",
  options = JsonOptions(Map(
    "database" -> "conndb",
    "dbtable" -> staging,
    "overwrite" -> "true",
    "postactions" -> postActions
  )),
  redshiftTmpDir = s"$tempDir/redshift",
  transformationContext = "redshift-output"
).writeDynamicFrame(datasetDf)
Make sure the user used for writing to Redshift has sufficient permissions to create/drop tables in the staging schema.
Apparently the connection_options dictionary parameter of the glueContext.write_dynamic_frame.from_jdbc_conf function has two interesting keys: preactions and postactions.
target_table = "my_schema.my_table"
stage_table = "my_schema.#my_table_stage_table"
pre_query = """
drop table if exists {stage_table};
create table {stage_table} as select * from {target_table} LIMIT 0;""".format(stage_table=stage_table, target_table=target_table)
post_query = """
begin;
delete from {target_table} using {stage_table} where {stage_table}.id = {target_table}.id ;
insert into {target_table} select * from {stage_table};
drop table {stage_table};
end;""".format(stage_table=stage_table, target_table=target_table)
datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=datasource0, catalog_connection="test_red", redshift_tmp_dir='s3://s3path', transformation_ctx="datasink4",
    connection_options={"preactions": pre_query, "postactions": post_query,
                        "dbtable": stage_table, "database": "redshiftdb"})
Based on https://aws.amazon.com/premiumsupport/knowledge-center/sql-commands-redshift-glue-job/
Yes, it is totally achievable. All you would need is to import the pg8000 module into your Glue job. pg8000 is the Python library used to make a connection to Amazon Redshift and execute SQL queries through a cursor.
Python Module Reference: https://github.com/mfenniak/pg8000
Then make a connection to your target cluster through pg8000.connect(user='user', database='dbname', host='hosturl', port=5439, password='urpasswrd'), use Glue's datasink option to load into the staging table, and then run the upsert SQL query using the pg8000 cursor:
>>> import pg8000
>>> conn = pg8000.connect(user='user',database='dbname',host='hosturl',port=5439,password='urpasswrd')
>>> cursor = conn.cursor()
>>> cursor.execute("CREATE TEMPORARY TABLE book (id SERIAL, title TEXT)")
>>> cursor.execute("INSERT INTO TABLE final_target"))
>>> conn.commit()
You would need to zip the pg8000 package, put it in an S3 bucket, and reference it in the Python library path under Advanced options / Job parameters in the Glue job section.
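For completeness, here is a rough sketch of what the upsert step itself could look like through the pg8000 cursor. The table names my_table and my_table_staging and the id key column are hypothetical, and Glue is assumed to have already loaded the new rows into the staging table:
import pg8000

conn = pg8000.connect(user='user', database='dbname', host='hosturl', port=5439, password='urpasswrd')
cursor = conn.cursor()

# Hypothetical tables: my_table_staging was loaded by the Glue datasink, my_table is the target.
cursor.execute("DELETE FROM my_table USING my_table_staging "
               "WHERE my_table.id = my_table_staging.id")
cursor.execute("INSERT INTO my_table SELECT * FROM my_table_staging")
cursor.execute("DROP TABLE my_table_staging")
conn.commit()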
I am using the BigQuery Python API to create a table, and would like to set an expiration date on the table, so that it is automatically dropped after a certain number of days.
Here is my code:
from datetime import datetime, timedelta
from google.cloud import bigquery as bq

client = bq.Client()
job_config = bq.QueryJobConfig()
dataset_id = dataset
table_ref = client.dataset(dataset_id).table(filename)
job_config.destination = table_ref
job_config.write_disposition = 'WRITE_TRUNCATE'
dt = datetime.now() + timedelta(seconds=259200)
unixtime = (dt - datetime(1970,1,1)).total_seconds()
expiration_time = unixtime
job_config.expires = expiration_time
query_job = client.query(query, job_config=job_config)
query_job.result()
The problem is that the expiration parameter doesn't seem to work. When I am checking the table detail in the UI, the expiration date is still Never.
To answer a slightly different question: rather than specifying the expiration as part of the request options, you can use a CREATE TABLE statement, where the relevant option is expiration_timestamp. For example:
CREATE OR REPLACE TABLE my_dataset.MyTable
(
x INT64,
y FLOAT64
)
OPTIONS (
expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 3 DAY)
);
This creates a table with two columns that will expire three days from now. CREATE TABLE supports an optional AS SELECT clause, too, if you want to create the table from the result of a query (the documentation goes into more detail).
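If you prefer to do the same thing from Python, a minimal sketch is to submit that DDL (including an AS SELECT clause) through the BigQuery client; the dataset, table, and query below are placeholders:
from google.cloud import bigquery

client = bigquery.Client()

# Hypothetical dataset/table; the expiration is carried by the DDL itself.
ddl = """
CREATE OR REPLACE TABLE my_dataset.MyTable
OPTIONS (
  expiration_timestamp = TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 3 DAY)
)
AS SELECT 1 AS x, 2.0 AS y
"""
client.query(ddl).result()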
To update an existing table expiration time with Python:
import datetime
from google.cloud import bigquery
client = bigquery.Client()
table = client.get_table("project.dataset.table")
table.expires = datetime.datetime.now() + datetime.timedelta(days=1)
client.update_table(table, ['expires'])
Credits: /u/ApproximateIdentity
Looking at the docs for the query method, we can see that it's not possible to set an expiration time in the query job config.
The proper way of doing so is to set it on the Table resource, something like this:
from datetime import datetime, timedelta
from google.cloud import bigquery as bq

client = bq.Client()
job_config = bq.QueryJobConfig()
dataset_id = dataset
table_ref = client.dataset(dataset_id).table(filename)
# As in the question's code, direct the query result into this table.
job_config.destination = table_ref
table = bq.Table(table_ref)
dt = datetime.now() + timedelta(seconds=259200)
table.expires = dt
client.create_table(table)
query_job = client.query(query, job_config=job_config)
query_job.result()
My Athena queries appear to return far fewer results than they should. Trying to figure out why.
Setup:
Glue catalog (118.6 GB in size).
Data: stored in S3 in both CSV and JSON format.
Athena query: when I query the data for a whole table, I only get 40K results per query; there should be 121 million records for that query, on average, for one month's data.
Does Athena cap query result data? Is this a service limit? (The documentation does not suggest this to be the case.)
So, getting 1000 results at a time obviously doesn't scale. Thankfully, there's a simple workaround. (Or maybe this is how it was supposed to be done all along.)
When you run an Athena query, you should get a QueryExecutionId. This Id corresponds to the output file you'll find in S3.
Here's a snippet I wrote:
import time
from typing import Dict

import boto3
import botocore.exceptions
import pandas as pd

s3 = boto3.resource("s3")
athena = boto3.client("athena")


def query_results_to_df(query: str) -> pd.DataFrame:
    response: Dict = athena.start_query_execution(QueryString=query, WorkGroup="<your_work_group>")
    execution_id: str = response["QueryExecutionId"]
    print(execution_id)

    # Wait until the query is finished
    while True:
        try:
            athena.get_query_results(QueryExecutionId=execution_id)
            break
        except botocore.exceptions.ClientError as e:
            time.sleep(5)

    # Download the full result file that Athena wrote to the query result location
    local_filename: str = "temp/athena_query_result_temp.csv"
    s3.Bucket("athena-query-output").download_file(execution_id + ".csv", local_filename)
    return pd.read_csv(local_filename)
Make sure the corresponding WorkGroup has "Query result location" set, e.g. "s3://athena-query-output/"
Also see this thread with similar answers: How to Create Dataframe from AWS Athena using Boto3 get_query_results method
It seems that there is a limit of 1000.
You should use NextToken to iterate over the results.
Quote from the GetQueryResults documentation:
MaxResults - The maximum number of results (rows) to return in this request.
Type: Integer
Valid Range: Minimum value of 0. Maximum value of 1000.
Required: No
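A minimal sketch of that NextToken loop with boto3, assuming the query has already finished (note that the first row of the first page contains the column headers):
import boto3

athena = boto3.client('athena')


def fetch_all_rows(query_execution_id: str):
    """Collect every result row by following NextToken across pages."""
    rows = []
    kwargs = {'QueryExecutionId': query_execution_id, 'MaxResults': 1000}
    while True:
        page = athena.get_query_results(**kwargs)
        rows.extend(page['ResultSet']['Rows'])
        token = page.get('NextToken')
        if not token:
            break
        kwargs['NextToken'] = token
    return rows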
Another option is a paginate-and-count approach (I don't know whether there is a better way, like a select count(*) from table ...).
Here is the complete example code, ready to use, with the Python boto3 Athena API.
I used a paginator, converted the result into a list of dicts, and also return the count along with the result.
Below are two methods: the first one paginates; the second one converts the paginated result into a list of dicts and calculates the count.
Note: converting into a list of dicts is not necessary in this case. If you don't want that, you can modify the code to keep only the count.
import time


def get_athena_results_paginator(params, athena_client):
    """
    :param params:
    :param athena_client:
    :return:
    """
    query_id = athena_client.start_query_execution(
        QueryString=params['query'],
        QueryExecutionContext={
            'Database': params['database']
        }
        # ,
        # ResultConfiguration={
        #     'OutputLocation': 's3://' + params['bucket'] + '/' + params['path']
        # }
        , WorkGroup=params['workgroup']
    )['QueryExecutionId']
    query_status = None
    while query_status == 'QUEUED' or query_status == 'RUNNING' or query_status is None:
        query_status = athena_client.get_query_execution(QueryExecutionId=query_id)['QueryExecution']['Status']['State']
        if query_status == 'FAILED' or query_status == 'CANCELLED':
            raise Exception('Athena query with the string "{}" failed or was cancelled'.format(params.get('query')))
        time.sleep(10)
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iter = results_paginator.paginate(
        QueryExecutionId=query_id,
        PaginationConfig={
            'PageSize': 1000
        }
    )
    count, results = result_to_list_of_dict(results_iter)
    return results, count


def result_to_list_of_dict(results_iter):
    """
    :param results_iter:
    :return:
    """
    results = []
    column_names = None
    count = 0
    for results_page in results_iter:
        for row in results_page['ResultSet']['Rows']:
            count = count + 1
            column_values = [col.get('VarCharValue', None) for col in row['Data']]
            if not column_names:
                # The first row of the result set holds the column headers.
                column_names = column_values
            else:
                results.append(dict(zip(column_names, column_values)))
    return count, results