Create Athena table with Json - amazon-web-services

I have completed a lab exercise that required me to find a string of text in some logs, which I did the long way (Excel) instead of the intelligent way (having the data populated nicely in an Athena table). I have multiple files within an S3 folder, each in JSON format like this...
{"Records":[{"eventVersion":"1.05","userIdentity":{"type":"AssumedRole","principalId":"ARXXXXXXXXXXXXXXXXXFVEC:AWSConfig-Describe","arn":"arn:aws:sts::2CCCCCCCCC8:assumed-role/AWSServiceRoleForConfig/AWSConfig-Describe","accountId":"2CCCCCCCCC8","accessKeyId":"ASXXXXXXXXXXXXXXXQVL","sessionContext":{"sessionIssuer":{"type":"Role","principalId":"ARXXXXXXXXXXXXXXXXXFVEC","arn":"arn:aws:iam::2CCCCCCCCC8:role/aws-service-role/config.amazonaws.com/AWSServiceRoleForConfig","accountId":"2CCCCCCCCC8","userName":"AWSServiceRoleForConfig"},"attributes":{"creationDate":"2019-09-03T07:40:00Z","mfaAuthenticated":"false"}},"invokedBy":"AWS Internal"},"eventTime":"2019-09-03T07:40:00Z","eventSource":"s3.amazonaws.com","eventName":"HeadBucket","awsRegion":"us-west-2","sourceIPAddress":"172.18.87.252","userAgent":"[AWSConfig]","requestParameters":{"bucketName":"service_logs_10l51wolgib72","Host":"s3.us-west-2.amazonaws.com"},"responseElements":null,"additionalEventData":{"SignatureVersion":"SigV4","CipherSuite":"ECDHE-RSA-AES128-SHA","bytesTransferredIn":0.0,"AuthenticationMethod":"AuthHeader","x-amz-id-2":"JYEwSk6jEv2rB/MjwluNXcnKxRSo72GCOz8WP9OYXDFI2FxS1T81K7excoDuo36rJIQz9MWYKEE=","bytesTransferredOut":0.0},"requestID":"E224F90BD7370007","eventID":"77d7ea03-b8a2-4b50-8f81-b8217eacf008","readOnly":true,"resources":[{"type":"AWS::S3::Object","ARNPrefix":"arn:aws:s3:::service_logs_10l51wolgib72/"},{"accountId":"2CCCCCCCCC8","type":"AWS::S3::Bucket","ARN":"arn:aws:s3:::service_logs_10l51wolgib72"}],"eventType":"AwsApiCall","recipientAccountId":"2CCCCCCCCC8","vpcEndpointId":"vpce-3c0ee766"},{"eventVersion":"1.05","userIdentity":{"type":"AssumedRole","principalId":"ARXXXXXXXXXXXXXXXXXFVEC:AWSConfig-Describe","arn":"arn:aws:sts::2CCCCCCCCC8:assumed-role/AWSServiceRoleForConfig/AWSConfig-Describe","accountId":"2CCCCCCCCC8","accessKeyId":"ASXXXXXXXXXXXXXXXQVL","sessionContext":{"sessionIssuer":{"type":"Role","principalId":"ARXXXXXXXXXXXXXXXXXFVEC","arn":"arn:aws:iam::2CCCCCCCCC8:role/aws-service-role/config.amazonaws.com/AWSServiceRoleForConfig","accountId":"2CCCCCCCCC8","userName":"AWSServiceRoleForConfig"},"attributes":{"creationDate":"2019-09-03T07:40:00Z","mfaAuthenticated":"false"}},"invokedBy":"AWS 
Internal"},"eventTime":"2019-09-03T07:40:00Z","eventSource":"s3.amazonaws.com","eventName":"HeadBucket","awsRegion":"us-west-2","sourceIPAddress":"172.18.87.252","userAgent":"[AWSConfig]","requestParameters":{"bucketName":"service_logs_10l51wolgib72","Host":"s3.us-west-2.amazonaws.com"},"responseElements":null,"additionalEventData":{"SignatureVersion":"SigV4","CipherSuite":"ECDHE-RSA-AES128-SHA","bytesTransferredIn":0.0,"AuthenticationMethod":"AuthHeader","x-amz-id-2":"JYEwSk6jEv2rB/MjwluNXcnKxRSo72GCOz8WP9OYXDFI2FxS1T81K7excoDuo36rJIQz9MWYKEE=","bytesTransferredOut":0.0},"requestID":"E224F90BD7370020","eventID":"77d7ea03-b8a2-4b50-8f81-b8217eacf021","readOnly":true,"resources":[{"type":"AWS::S3::Object","ARNPrefix":"arn:aws:s3:::service_logs_10l51wolgib72/"},{"accountId":"2CCCCCCCCC8","type":"AWS::S3::Bucket","ARN":"arn:aws:s3:::service_logs_10l51wolgib72"}],"eventType":"AwsApiCall","recipientAccountId":"2CCCCCCCCC8","vpcEndpointId":"vpce-3c0ee766"},{"eventVersion":"1.05","userIdentity":{"type":"AssumedRole","principalId":"ARXXXXXXXXXXXXXXXXXFVEC:AWSConfig-Describe","arn":"arn:aws:sts::2CCCCCCCCC8:assumed-role/AWSServiceRoleForConfig/AWSConfig-Describe","accountId":"2CCCCCCCCC8","accessKeyId":"ASXXXXXXXXXXXXXXXQVL","sessionContext":{"sessionIssuer":{"type":"Role","principalId":"ARXXXXXXXXXXXXXXXXXFVEC","arn":"arn:aws:iam::2CCCCCCCCC8:role/aws-service-role/config.amazonaws.com/AWSServiceRoleForConfig","accountId":"2CCCCCCCCC8","userName":"AWSServiceRoleForConfig"},"attributes":{"creationDate":"2019-09-03T07:40:00Z","mfaAuthenticated":"false"}},"invokedBy":"AWS Internal"},"eventTime":"2019-09-03T07:40:00Z","eventSource":"s3.amazonaws.com","eventName":"HeadBucket","awsRegion":"us-west-2","sourceIPAddress":"172.18.87.252","userAgent":"[AWSConfig]","requestParameters":{"bucketName":"service_logs_10l51wolgib72","Host":"s3.us-west-2.amazonaws.com"},"responseElements":null,"additionalEventData":{"SignatureVersion":"SigV4","CipherSuite":"ECDHE-RSA-AES128-SHA","bytesTransferredIn":0.0,"AuthenticationMethod":"AuthHeader","x-amz-id-2":"JYEwSk6jEv2rB/MjwluNXcnKxRSo72GCOz8WP9OYXDFI2FxS1T81K7excoDuo36rJIQz9MWYKEE=","bytesTransferredOut":0.0},"requestID":"E224F90BD7370033","eventID":"77d7ea03-b8a2-4b50-8f81-b8217eacf034","readOnly":true,"resources":[{"type":"AWS::S3::Object","ARNPrefix":"arn:aws:s3:::service_logs_10l51wolgib72/"},{"accountId":"2CCCCCCCCC8","type":"AWS::S3::Bucket","ARN":"arn:aws:s3:::service_logs_10l51wolgib72"}],"eventType":"AwsApiCall","recipientAccountId":"2CCCCCCCCC8","vpcEndpointId":"vpce-3c0ee766"}]}
I am importing them (the whole directory; the files all share the same format) into Athena using this code from an online reference, but no matter how I adjust it, I still get a table with a single column...
CREATE EXTERNAL TABLE IF NOT EXISTS `default`.testjson4 (
Records struct<eventVersion:string,
userIdentity:string,
eventTime:string,
eventSource:string>)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
'ignore.malformed.json' = 'FALSE',
'dots.in.keys' = 'FALSE',
'case.insensitive' = 'TRUE',
'mapping' = 'TRUE'
)
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://******************/CloudTrail/us-east-1/****/'
TBLPROPERTIES ('classification' = 'json');
I would like help on how to do this correctly, thank you!
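One likely cause (my observation, not part of the original post): in the sample data, Records is a JSON array of objects, while the DDL above declares it as a single struct, so the SerDe cannot map the rows. A minimal sketch of a table that models Records as array<struct<...>> (only a few of the fields shown, bucket path is a placeholder, untested against this exact data):
CREATE EXTERNAL TABLE IF NOT EXISTS `default`.testjson4 (
Records array<struct<
eventVersion: string,
eventTime: string,
eventSource: string,
eventName: string,
awsRegion: string,
sourceIPAddress: string>>)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION 's3://your-bucket/CloudTrail/us-east-1/';  -- hypothetical path
Each event can then be flattened into its own row with UNNEST:
SELECT r.eventTime, r.eventName, r.sourceIPAddress
FROM "default".testjson4
CROSS JOIN UNNEST(Records) AS t(r);
For real CloudTrail logs Athena also documents a dedicated CloudTrail SerDe, but for generic JSON shaped like the sample above, the array-of-struct approach is a reasonable starting point.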

Related

Creating a table with partition projection results in an empty one

I'm creating a table in Athena based on a list of CSV files in an S3 bucket. The files in the bucket are placed in folders like this:
$ aws s3 ls s3://bucket-name/ --recursive
2023-01-23 16:05:01 25601 logs2023/01/23/23/analytics_Log-1-2023-01-23-23-59-59-6dc5bd4c-f00f-4f34-9292-7bfa9ec33c55
2023-01-23 16:10:03 18182 logs2023/01/24/00/analytics_Log-1-2023-01-24-00-05-01-aa2cb565-05c8-43e2-a203-96324f66a5a7
2023-01-23 16:15:05 20350 logs2023/01/24/00/analytics_Log-1-2023-01-24-00-10-03-87b03989-c059-4fca-8e8b-909e787db889
2023-01-23 16:20:09 25187 logs2023/01/24/00/analytics_Log-1-2023-01-24-00-15-06-6d9b39fb-c05f-4416-9b17-415f48e63591
2023-01-23 16:25:18 20590 logs2023/01/24/00/analytics_Log-1-2023-01-24-00-20-16-3939a0fe-8cfb-4168-bc8e-e71d2122add5
This is the format for the folder structure:
logs{year}/{month}/{day}/{hour}/<filename>
I would like to use Athena's partition projection and this is how I'm creating my table:
CREATE EXTERNAL TABLE analytics.logs (
id string,
...
type tinyint)
PARTITIONED BY (
year bigint COMMENT '',
month string COMMENT '',
day string COMMENT '')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket-name/'
TBLPROPERTIES (
'classification'='csv',
'partition.day.values'='01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31',
'partition.day.type'='enum',
'partition.enable'='true',
'partition.month.values'='01,02,03,04,05,06,07,08,09,10,11,12',
'partition.month.type'='enum',
'partition.year.range'='2022,2100',
'partition.year.type'='integer',
'storage.location.template'='s3://bucket-name/logs${year}/${month}/${day}/')
As you can see, I'm trying to partition the data using year, month, and day. Even though there's also an hour folder, I'm not interested in that. This command executes just fine and it creates the table too. But when I query the table:
SELECT * FROM analytics.logs LIMIT 10;
It returns empty. But if I create the same table without the PARTITIONED part, I can see the records. Can someone please help me understand what I'm doing wrong?
[UPDATE]
I simplified the folder structure to see if it works. It does not.
$ aws s3 ls s3://bucket-name/test --recursive
2023-01-24 07:03:30 0 test/
2023-01-24 07:03:59 0 test/2022/
2023-01-24 07:11:06 13889 test/2022/Log-1-2022-12-01-00-00-11-255f8d74-5417-42a0-8c09-97282a626903
2023-01-24 07:11:05 8208 test/2022/Log-1-2022-12-01-00-05-15-c34eda24-36d8-484c-b7b6-4861c297d857
CREATE EXTERNAL TABLE `log_2`(
`id` string,
...
`type` tinyint)
PARTITIONED BY (
`year` bigint COMMENT '')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket-name/test'
TBLPROPERTIES (
'classification'='csv',
'partition.enable'='true',
'partition.year.range'='2021,2023',
'partition.year.type'='integer',
'storage.location.template'='s3://bucket-name/test/${year}/')
And still the following query returns nothing:
SELECT * FROM "analytics"."log_2" where year = 2022 limit 10;
You have a mismatch in data types: the partition column year is declared as bigint, while the partition projection type is integer. Make both integers. Also note how the projection properties are named, for example:
"projection.enabled" = "true",
"projection.datehour.type" = "date",
"projection.datehour.format" = "yyyy/MM/dd/HH",
"projection.datehour.range" = "2021/01/01/00,NOW",
"projection.datehour.interval" = "1",
"projection.datehour.interval.unit" = "HOURS",
That is, change the partition prefix to projection.
For anyone else who might make my mistake: the problem was that I was (incorrectly) using partition in the TBLPROPERTIES section, when it should have been projection.
To provide you with a working example:
CREATE EXTERNAL TABLE `log_2`(
id string,
...
type tinyint)
PARTITIONED BY (
`year` bigint COMMENT '')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket-name/test'
TBLPROPERTIES (
'classification'='csv',
'projection.enabled'='true',
'projection.year.range'='2021,2023',
'projection.year.type'='integer',
'storage.location.template'='s3://bucket-name/test/${year}/')
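For reference, applying the same fix to the original logs{year}/{month}/{day}/{hour} layout from the question would presumably give TBLPROPERTIES along these lines (a sketch assembled from the question's own values, not tested):
TBLPROPERTIES (
'classification'='csv',
'projection.enabled'='true',
'projection.year.type'='integer',
'projection.year.range'='2022,2100',
'projection.month.type'='enum',
'projection.month.values'='01,02,03,04,05,06,07,08,09,10,11,12',
'projection.day.type'='enum',
'projection.day.values'='01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31',
'storage.location.template'='s3://bucket-name/logs${year}/${month}/${day}/')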

How to export hive table data into csv.gz format stored in s3

So I have two Hive queries: one that creates the table, and another that reads Parquet data from a different table and inserts the relevant columns into my new table. I would like this new Hive table to export its data to an S3 location in csv.gz format. My Hive queries running on EMR currently output files named 00000_0.gz, and I have to rename them to csv.gz using a bash script. This is quite hacky, as I have to mount my S3 directory in my terminal; it would be ideal if my queries could do this directly. Could someone please review my queries to see whether there is any fault? Many thanks.
CREATE TABLE db.test (
app_id string,
app_account_id string,
sdk_ts BIGINT,
device_id string)
PARTITIONED BY (
load_date string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION "s3://test_unload/";
set hive.execution.engine=tez;
set hive.cli.print.header=true;
set hive.exec.compress.output=true;
set hive.merge.tezfiles=true;
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.smallfiles.avgsize=1024000000;
set hive.merge.size.per.task=1024000000;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into db.test
partition(load_date)
select
'' as app_id,
'288' as app_account_id,
from_unixtime(CAST(event_epoch as BIGINT), 'yyyy-MM-dd HH:mm:ss') as sdk_ts,
device_id,
'20221106' as load_date
FROM processed_events.test
where load_date = '20221106';
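Not from the original thread, but a possible direction to explore: Hive can write delimited, compressed output straight to an S3 prefix with INSERT OVERWRITE DIRECTORY, as sketched below (the output path and column list are assumptions based on the question). Note that Hive still picks the file names itself (e.g. 000000_0.gz), so a rename step is typically still needed if the literal .csv.gz extension is required.
-- sketch only: enable gzip output, then export one partition as CSV
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
INSERT OVERWRITE DIRECTORY 's3://test_unload/export/load_date=20221106/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT app_id, app_account_id, sdk_ts, device_id
FROM db.test
WHERE load_date = '20221106';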

AWS Athena query returning empty string

I've seen other questions saying their query returns no results. This is not what is happening with my query. The query itself is returning empty strings/results.
I have an 81.7MB JSON file in my input bucket (input-data/test_data). I've set up the data source as JSON.
However, when I execute SELECT * FROM test_table; it shows (in green) that the data has been scanned and the query was successful, and there are results, but nothing is saved to the output bucket or displayed in the GUI.
I'm not sure what I've done wrong in the setup.
This is my table creation:
CREATE EXTERNAL TABLE IF NOT EXISTS `test_db`.`test_data` (
`tbl_timestamp` timestamp,
`colmn1` string,
`colmn2` string,
`colmn3` string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1'
) LOCATION 's3://input-data/test_data/'
TBLPROPERTIES ('has_encrypted_data'='false',
'skip.header.line.count'='1');
Resolved this issue. The column names of the table need to match the keys in the JSON file itself. Simple, really!
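As a side note (my addition, not part of the original answer): if the JSON keys cannot be renamed to match the column names, the OpenX JSON SerDe also supports per-column key mapping. A minimal sketch, assuming (hypothetically) that the file's key is timestamp while the table column is tbl_timestamp:
CREATE EXTERNAL TABLE IF NOT EXISTS `test_db`.`test_data` (
`tbl_timestamp` timestamp,
`colmn1` string,
`colmn2` string,
`colmn3` string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
-- map the table column to the actual JSON key (hypothetical key name)
'mapping.tbl_timestamp' = 'timestamp'
) LOCATION 's3://input-data/test_data/';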

Need to skip CSV header when reading from s3

When I'm trying to load a CSV file from S3, the header row gets loaded into the columns as data. I tried to skip the header with
TBLPROPERTIES (
"skip.header.line.count"="1")
But it still has no effect.
Any advice, please?
CREATE EXTERNAL TABLE skipheader(
permalink string,
company string,
numemps bigint,
category string,
city string,
state string,
fundeddate string,
raisedamt bigint,
raisedcurrency string,
round string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucketname/filename/'
TBLPROPERTIES (
"skip.header.line.count"="1")
Looking at the release notes from when the feature was released, it says:
Support for ignoring headers. You can use the skip.header.line.count property when defining tables, to allow Athena to ignore headers. This is currently supported for queries that use the OpenCSV SerDe, and not for Grok or Regex SerDes.
My interpretation of this is that it won't work with LazySimpleSerde, which is what you get when you say ROW FORMAT DELIMITED, and that you have to use the OpenCSV serde:
CREATE EXTERNAL TABLE skipheader ( … )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ('separatorChar' = ',')
STORED AS TEXTFILE
LOCATION 's3://bucketname/filename/'
TBLPROPERTIES ("skip.header.line.count"="1")
The OpenCSV SerDe works differently from LazySimpleSerDe; it has much more limited data type support, but on the other hand it is more configurable.
If you can use the OpenCSV SerDe and make it work for you as described by Theo, go for it. However, if you have other tables in other formats, you can get around the problem in the following way, even though it is a bit of a hack: simply add a WHERE clause that excludes the headers, like
SELECT * FROM skipheader WHERE permalink != 'permalink'. Recently, Athena added the ability to create a new table as the result of a query (https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html), so you could even filter out the headers and save the result to a new location using Athena, if that works better for you.
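For example, a CTAS statement along these lines (the output table name and S3 location are made-up placeholders) would write a header-free copy of the data that downstream queries can use:
CREATE TABLE mydb.skipheader_noheader
WITH (
format = 'PARQUET',
external_location = 's3://bucketname/noheader/'
) AS
SELECT *
FROM skipheader
WHERE permalink != 'permalink';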

How can I create a table with only some specific files (wildcard) using Amazon Athena?

My bucket used to have this structure:
mybucket/raw/i1.json
mybucket/raw/i2.json
It was easy and straightforward to create the table in Amazon Athena using the code below.
CREATE EXTERNAL TABLE IF NOT EXISTS myclients.big_clients (
`id_number` string,
`txt` string,
...
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1'
) LOCATION 's3://mybucket/raw/'
TBLPROPERTIES ('has_encrypted_data'='false');
Now I'm facing some problems after a migration of the bucket structure.
The new structure of the bucket is shown below.
mybucket/raw/1/i1.json
mybucket/raw/1/docs/doc_1.json
mybucket/raw/1/docs/doc_2.json
mybucket/raw/1/docs/doc_3.json
mybucket/raw/2/i2.json
mybucket/raw/2/docs/doc_1.json
mybucket/raw/2/docs/doc_2.json
I would now like to create two tables (the same table I had before the migration, and a new one containing only the docs).
Is there any way I could do that without having to rearrange my files in another folder?
I'm looking for some kind of wildcard over the bucket files when creating the table.
CREATE EXTERNAL TABLE IF NOT EXISTS myclients.big_clients (
`id_number` string,
`txt` string,
...
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' = 'i*.json'
) LOCATION 's3://mybucket/raw/'
TBLPROPERTIES ('has_encrypted_data'='false');
CREATE EXTERNAL TABLE IF NOT EXISTS myclients.big_clients_docs (
`dt` date,
`txt` string,
`id_number` string,
`s3_doc_path` string,
`s3_doc_path_origin` string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' = 'doc_*.json'
) LOCATION 's3://mybucket/raw/'
TBLPROPERTIES ('has_encrypted_data'='false');
I was looking for the same thing. Unfortunately this is not possible, because the S3 API is not that wildcard-friendly (it would require scanning all the keys client-side, which is slow). The Athena documentation also states that this is not supported.