Yahoo Finance Historical data downloader url is not working - cookies

I have used the following URL to fetch historical data from Yahoo Finance. Since 16th May 2017 the URL no longer works.
http://real-chart.finance.yahoo.com/table.csv?s=AAL&a=04&b=01&c=2017&d=04&e=02&f=2017&g=d&ignore=.csv
It seems they have changed the URL, and the new URL is:
https://query1.finance.yahoo.com/v7/finance/download/AAL?period1=1494873000&period2=1494959400&interval=1d&events=history&crumb=l0aEtuOKocj
The changed URL contains a crumb parameter that is tied to a session cookie. Does anyone have an idea how to get this cookie (and crumb) programmatically, in Java?

I recently wrote a simple Python script to download the history of a single stock.
Here is an example of how to invoke it:
python get_quote_history.py --symbol=IBM --from=2017-01-01 --to=2017-05-25 -o IBM.csv
This will download IBM historical prices from 2017-01-01 to 2017-05-25 and save them in the IBM.csv file.
import re
import urllib2
import calendar
import datetime
import getopt
import sys
import time

crumble_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
cookie_regex = r'Set-Cookie: (.*?); '
quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{}?period1={}&period2={}&interval=1d&events=history&crumb={}'


def get_crumble_and_cookie(symbol):
    # Load the quote history page once to harvest both the session cookie
    # and the matching crumb embedded in the page source.
    link = crumble_link.format(symbol)
    response = urllib2.urlopen(link)
    match = re.search(cookie_regex, str(response.info()))
    cookie_str = match.group(1)
    text = response.read()
    match = re.search(crumble_regex, text)
    crumble_str = match.group(1)
    return crumble_str, cookie_str


def download_quote(symbol, date_from, date_to):
    time_stamp_from = calendar.timegm(datetime.datetime.strptime(date_from, "%Y-%m-%d").timetuple())
    time_stamp_to = calendar.timegm(datetime.datetime.strptime(date_to, "%Y-%m-%d").timetuple())
    attempts = 0
    while attempts < 5:
        crumble_str, cookie_str = get_crumble_and_cookie(symbol)
        link = quote_link.format(symbol, time_stamp_from, time_stamp_to, crumble_str)
        # print link
        r = urllib2.Request(link, headers={'Cookie': cookie_str})
        try:
            response = urllib2.urlopen(r)
            text = response.read()
            print "{} downloaded".format(symbol)
            return text
        except urllib2.URLError:
            print "{} failed at attempt # {}".format(symbol, attempts)
            attempts += 1
            time.sleep(2 * attempts)
    return ""


if __name__ == '__main__':
    print get_crumble_and_cookie('KO')
    from_arg = "from"
    to_arg = "to"
    symbol_arg = "symbol"
    output_arg = "o"
    opt_list = (from_arg + "=", to_arg + "=", symbol_arg + "=")
    try:
        options, args = getopt.getopt(sys.argv[1:], output_arg + ":", opt_list)
    except getopt.GetoptError as err:
        print err
        sys.exit(2)  # bail out: 'options' is undefined past this point
    for opt, value in options:
        if opt[2:] == from_arg:
            from_val = value
        elif opt[2:] == to_arg:
            to_val = value
        elif opt[2:] == symbol_arg:
            symbol_val = value
        elif opt[1:] == output_arg:
            output_val = value
    print "downloading {}".format(symbol_val)
    text = download_quote(symbol_val, from_val, to_val)
    with open(output_val, 'wb') as f:
        f.write(text)
    print "{} written to {}".format(symbol_val, output_val)

Andrea Galeazzi's excellent answer, with added options for splits and dividends, and tweaked for Python 3.
Also changed so that "to:date" is included in the returned results; the previous code returned up to but not including "to:date". Just different!
And be aware that Yahoo made minor changes in price rounding, column order, and split syntax.
## Downloaded from
## https://stackoverflow.com/questions/44044263/yahoo-finance-historical-data-downloader-url-is-not-working
## Modified for Python 3
## Added --event=history|div|split default = history
## changed so "to:date" is included in the returned results
## usage: download_quote(symbol, date_from, date_to, events).decode('utf-8')
import re
from urllib.request import urlopen, Request
from urllib.error import URLError  # URLError lives in urllib.error in Python 3
import calendar
import datetime
import getopt
import sys
import time

crumble_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
cookie_regex = r'Set-Cookie: (.*?); '
quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{}?period1={}&period2={}&interval=1d&events={}&crumb={}'


def get_crumble_and_cookie(symbol):
    link = crumble_link.format(symbol)
    response = urlopen(link)
    match = re.search(cookie_regex, str(response.info()))
    cookie_str = match.group(1)
    text = response.read().decode("utf-8")
    match = re.search(crumble_regex, text)
    crumble_str = match.group(1)
    return crumble_str, cookie_str


def download_quote(symbol, date_from, date_to, events):
    time_stamp_from = calendar.timegm(datetime.datetime.strptime(date_from, "%Y-%m-%d").timetuple())
    # Add one day so that "to:date" is included in the returned results.
    next_day = datetime.datetime.strptime(date_to, "%Y-%m-%d") + datetime.timedelta(days=1)
    time_stamp_to = calendar.timegm(next_day.timetuple())
    attempts = 0
    while attempts < 5:
        crumble_str, cookie_str = get_crumble_and_cookie(symbol)
        link = quote_link.format(symbol, time_stamp_from, time_stamp_to, events, crumble_str)
        # print(link)
        r = Request(link, headers={'Cookie': cookie_str})
        try:
            response = urlopen(r)
            text = response.read()
            print("{} downloaded".format(symbol))
            return text
        except URLError:
            print("{} failed at attempt # {}".format(symbol, attempts))
            attempts += 1
            time.sleep(2 * attempts)
    return b''


if __name__ == '__main__':
    print(get_crumble_and_cookie('KO'))
    from_arg = "from"
    to_arg = "to"
    symbol_arg = "symbol"
    event_arg = "event"
    output_arg = "o"
    opt_list = (from_arg + "=", to_arg + "=", symbol_arg + "=", event_arg + "=")
    try:
        options, args = getopt.getopt(sys.argv[1:], output_arg + ":", opt_list)
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)
    symbol_val = ""
    from_val = ""
    to_val = ""
    output_val = ""
    event_val = "history"
    for opt, value in options:
        if opt[2:] == from_arg:
            from_val = value
        elif opt[2:] == to_arg:
            to_val = value
        elif opt[2:] == symbol_arg:
            symbol_val = value
        elif opt[2:] == event_arg:
            event_val = value
        elif opt[1:] == output_arg:
            output_val = value
    print("downloading {}".format(symbol_val))
    text = download_quote(symbol_val, from_val, to_val, event_val)
    if text:
        with open(output_val, 'wb') as f:
            f.write(text)
        print("{} written to {}".format(symbol_val, output_val))

Got it to work, now I just have to parse the csv. Thought I'd share since I was having trouble with the syntax.
Dim crumb As String: crumb = "xxxx"
Dim cookie As String: cookie = "yyyy"
Dim urlStock As String: urlStock = "https://query1.finance.yahoo.com/v7/finance/download/SIRI?" & _
"period1=1274158800&" & _
"period2=1495059477&" & _
"interval=1d&events=history&crumb=" & crumb
Dim http As MSXML2.XMLHTTP: Set http = New MSXML2.ServerXMLHTTP
http.Open "GET", urlStock, False
http.setRequestHeader "Cookie", cookie
http.send

You can manually save the crumb/cookie pair in Chrome, or you can use something like this to generate it. Then just set the Cookie header in Java and pass the corresponding crumb in the URL.

I wrote a lightweight script that pulls together a lot of the suggestions in this thread to fix this problem: https://github.com/AndrewRPorter/yahoo-historical
However, there are much better solutions, such as https://github.com/ranaroussi/fix-yahoo-finance
Hope these resources help!
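For reference, a minimal usage sketch of fix-yahoo-finance, assuming the download API its README showed at the time:

import fix_yahoo_finance as yf

# Fetch daily OHLCV data for one ticker over a date range and save it.
data = yf.download("AAPL", start="2017-01-01", end="2017-05-25")
data.to_csv("AAPL.csv")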

I developed the following solution for this issue in Excel/VBA. The key challenge was the creation of the Crumb / Cookie pair. Once that is created you can re-use it for calls to Yahoo for the historical prices.
Here is the key code for the crumb/cookie pair:
Sub GetYahooRequest(strCrumb As String, strCookie As String)
    'This routine will use a sample request to Yahoo to obtain a valid Cookie and Crumb
    Dim strUrl As String: strUrl = "https://finance.yahoo.com/lookup?s=%7B0%7D"
    Dim objRequest As WinHttp.WinHttpRequest

    Set objRequest = New WinHttp.WinHttpRequest
    With objRequest
        .Open "GET", strUrl, True
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
        .send
        .waitForResponse
        strCrumb = strExtractCrumb(.responseText)
        strCookie = Split(.getResponseHeader("Set-Cookie"), ";")(0)
    End With
End Sub
See the following Yahoo Historical Price Extract on my website for a Sample Excel workbook that demonstrates how to extract Yahoo Historical prices

Great answer Andrea; I have added to your code to allow for downloads of multiple stocks (Python 2.7).
file1: down.py
import os

myfile = open("ticker.csv", "r")
lines = myfile.readlines()
for line in lines:
    ticker = line.strip()
    cmd = "python get_quote_history.py --symbol=%s --from=2017-01-01 --to=2017-05-25 -o %s.csv" % (ticker, ticker)
    os.system(cmd)
file2: ticker.csv
AAPL
MSFT
file3: get_quote_history.py
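As a variation, the shell-out can be avoided by importing the downloader directly; a sketch, assuming get_quote_history.py from Andrea's answer is importable:

from get_quote_history import download_quote

with open("ticker.csv") as f:
    for line in f:
        ticker = line.strip()
        if ticker:
            # Reuse the existing function instead of spawning a new interpreter.
            with open("%s.csv" % ticker, "wb") as out:
                out.write(download_quote(ticker, "2017-01-01", "2017-05-25"))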

Is there a way to generate the AWS Console URLs for CloudWatch Log Group filters?

I would like to send my users directly to a specific log group and filter, but I need to be able to generate the proper URL format. For example, this URL
https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/
%252Fmy%252Flog%252Fgroup%252Fgoes%252Fhere/log-events/$3FfilterPattern$3D$255Bincoming_ip$252C$2Buser_name$252C$2Buser_ip$2B$252C$2Btimestamp$252C$2Brequest$2B$2521$253D$2B$2522GET$2B$252Fhealth_checks$252Fall$2B*$2522$252C$2Bstatus_code$2B$253D$2B5*$2B$257C$257C$2Bstatus_code$2B$253D$2B429$252C$2Bbytes$252C$2Burl$252C$2Buser_agent$255D$26start$3D-172800000
will take you to a log group named /my/log/group/goes/here and filter messages with this pattern for the past 2 days:
[incoming_ip, user_name, user_ip , timestamp, request != "GET /health_checks/all *", status_code = 5* || status_code = 429, bytes, url, user_agent]
I can decode part of the URL, but I don't know what some of the other characters should be (see below), and this doesn't really look like any standard URL encoding to me. Does anyone know an encoder/decoder for this URL format?
%252F == /
$252C == ,
$255B == [
$255D == ]
$253D == =
$2521 == !
$2522 == "
$252F == /
$257C == |
$2B == +
$26 == &
$3D == =
$3F == ?
First of all I'd like to thank the other guys for the clues. What follows is the complete explanation of how Log Insights links are constructed.
Overall it's just a weirdly encoded rendering of an object structure that works like this:
The part after ?queryDetail= is the object representation, and {} are represented by ~()
The object is walked down to its primitive values, and the latter are transformed as follows:
encodeURIComponent(value), so that all special characters are transformed to %xx
replace(/%/g, "*"), so that this encoding is not affected by the top-level ones
if the value type is string, it is prefixed with an unmatched single quote
To illustrate:
"Hello world" -> "Hello%20world" -> "Hello*20world" -> "'Hello*20world"
Arrays of transformed primitives are joined using ~ and likewise put inside a ~() construct
Then, after the primitive transformation is done, the object is joined using "~".
After that the string is escape()d (note: escape(), not encodeURIComponent(), because the latter doesn't transform ~ in JS).
After that, ?queryDetail= is prepended.
And finally this string is encodeURIComponent()ed and, as a cherry on top, % is replaced with $.
Let's see how it works in practice. Say these are our query parameters:
const expression = `fields #timestamp, #message
    | filter #message not like 'example'
    | sort #timestamp asc
    | limit 100`;
const logGroups = ["/application/sample1", "/application/sample2"];
const queryParameters = {
    end: 0,
    start: -3600,
    timeType: "RELATIVE",
    unit: "seconds",
    editorString: expression,
    isLiveTrail: false,
    source: logGroups,
};
Firstly primitives are transformed:
const expression = "'fields*20*40timestamp*2C*20*40message*0A*20*20*20*20*7C*20filter*20*40message*20not*20like*20'example'*0A*20*20*20*20*7C*20sort*20*40timestamp*20asc*0A*20*20*20*20*7C*20limit*20100";
const logGroups = ["'*2Fapplication*2Fsample1", "'*2Fapplication*2Fsample2"];
const queryParameters = {
    end: 0,
    start: -3600,
    timeType: "'RELATIVE",
    unit: "'seconds",
    editorString: expression,
    isLiveTrail: false,
    source: logGroups,
};
Then the object is joined using ~, giving the object representation string:
const objectString = "~(end~0~start~-3600~timeType~'RELATIVE~unit~'seconds~editorString~'fields*20*40timestamp*2C*20*40message*0A*20*20*20*20*7C*20filter*20*40message*20not*20like*20'example'*0A*20*20*20*20*7C*20sort*20*40timestamp*20asc*0A*20*20*20*20*7C*20limit*20100~isLiveTrail~false~source~(~'*2Fapplication*2Fsample1~'*2Fapplication*2Fsample2))"
Now we escape() it:
const escapedObject = "%7E%28end%7E0%7Estart%7E-3600%7EtimeType%7E%27RELATIVE%7Eunit%7E%27seconds%7EeditorString%7E%27fields*20*40timestamp*2C*20*40message*0A*20*20*20*20*7C*20filter*20*40message*20not*20like*20%27example%27*0A*20*20*20*20*7C*20sort*20*40timestamp*20asc*0A*20*20*20*20*7C*20limit*20100%7EisLiveTrail%7Efalse%7Esource%7E%28%7E%27*2Fapplication*2Fsample1%7E%27*2Fapplication*2Fsample2%29%29"
Now we prepend ?queryDetail=:
const withQueryDetail = "?queryDetail=%7E%28end%7E0%7Estart%7E-3600%7EtimeType%7E%27RELATIVE%7Eunit%7E%27seconds%7EeditorString%7E%27fields*20*40timestamp*2C*20*40message*0A*20*20*20*20*7C*20filter*20*40message*20not*20like*20%27example%27*0A*20*20*20*20*7C*20sort*20*40timestamp*20asc*0A*20*20*20*20*7C*20limit*20100%7EisLiveTrail%7Efalse%7Esource%7E%28%7E%27*2Fapplication*2Fsample1%7E%27*2Fapplication*2Fsample2%29%29"
Finally we URL-encode it, replace % with $, and voilà:
const result = "$3FqueryDetail$3D$257E$2528end$257E0$257Estart$257E-3600$257EtimeType$257E$2527RELATIVE$257Eunit$257E$2527seconds$257EeditorString$257E$2527fields*20*40timestamp*2C*20*40message*0A*20*20*20*20*7C*20filter*20*40message*20not*20like*20$2527example$2527*0A*20*20*20*20*7C*20sort*20*40timestamp*20asc*0A*20*20*20*20*7C*20limit*20100$257EisLiveTrail$257Efalse$257Esource$257E$2528$257E$2527*2Fapplication*2Fsample1$257E$2527*2Fapplication*2Fsample2$2529$2529"
And putting it all together:
function getInsightsUrl(queryDefinitionId, start, end, expression, sourceGroup, timeType = 'ABSOLUTE', region = 'eu-west-1') {
    const p = m => escape(m);
    const s = m => escape(m).replace(/%/gi, '*');
    const queryDetail
        = p('~(')
        + p("end~'")
        + s(end.toUTC().toISO()) // converted using Luxon
        + p("~start~'")
        + s(start.toUTC().toISO()) // converted using Luxon
        // Or use UTC instead of Local
        + p(`~timeType~'${timeType}~tz~'Local~editorString~'`)
        + s(expression)
        + p('~isLiveTail~false~queryId~\'')
        + s(queryDefinitionId)
        + p("~source~(~'") + s(sourceGroup) + p(')')
        + p(')');
    return `https://${region}.console.aws.amazon.com/cloudwatch/home?region=${region}#logsV2:logs-insights${escape(`?queryDetail=${queryDetail}`).replace(/%/gi, '$')}`;
}
Of course the reverse operation can be performed as well; a sketch follows below.
That's all folks. Have fun, take care and try to avoid doing such weird stuff yourselves. :)
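A rough Python sketch of that reverse operation (assuming no literal * or $ characters survive inside the encoded values):

from urllib.parse import unquote

def decode_insights_fragment(fragment):
    # Undo the outer layer: '$' was substituted for '%' after URL-encoding.
    outer = unquote(fragment.replace("$", "%"))
    # Undo the inner layer applied to primitive values: '*' stood in for '%'.
    return unquote(outer.replace("*", "%"))

# e.g. decode_insights_fragment(result) recovers the object representation string.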
I had to do a similar thing to generate a back link to the logs for a lambda and did the following hackish thing to create the link:
const link = `https://${process.env.AWS_REGION}.console.aws.amazon.com/cloudwatch/home?region=${process.env.AWS_REGION}#logsV2:log-groups/log-group/${process.env.AWS_LAMBDA_LOG_GROUP_NAME.replace(/\//g, '$252F')}/log-events/${process.env.AWS_LAMBDA_LOG_STREAM_NAME.replace('$', '$2524').replace('[', '$255B').replace(']', '$255D').replace(/\//g, '$252F')}`
A colleague of mine figured out that the encoding is nothing special: it is standard URI percent-encoding, but applied twice (2x). In JavaScript you can use the encodeURIComponent function to test this out:
let inp = 'https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/'
console.log(encodeURIComponent(inp))
console.log(encodeURIComponent(encodeURIComponent(inp)))
This piece of JavaScript produces the expected output at the second encoding stage:
https%3A%2F%2Fconsole.aws.amazon.com%2Fcloudwatch%2Fhome%3Fregion%3Dus-east-1%23logsV2%3Alog-groups%2Flog-group%2F
https%253A%252F%252Fconsole.aws.amazon.com%252Fcloudwatch%252Fhome%253Fregion%253Dus-east-1%2523logsV2%253Alog-groups%252Flog-group%252F
Caution
At least some parts use double encoding, though not the whole link. Otherwise every special character would take the form $25xx after double encoding, but some (e.g. $3F) are only single-encoded. Hope this helps anyway ;)
My complete JavaScript solution, based on @isaias-b's answer, which also adds a timestamp filter on the logs:
const logBaseUrl = 'https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group';
const encode = text => encodeURIComponent(text).replace(/%/g, '$');
const awsEncode = text => encodeURIComponent(encodeURIComponent(text)).replace(/%/g, '$');
const encodeTimestamp = timestamp => encode('?start=') + awsEncode(new Date(timestamp).toJSON());
const awsLambdaLogBaseUrl = `${logBaseUrl}/${awsEncode('/aws/lambda/')}`;
const logStreamUrl = (logGroup, logStream, timestamp) =>
    `${awsLambdaLogBaseUrl}${logGroup}/log-events/${awsEncode(logStream)}${timestamp ? encodeTimestamp(timestamp) : ''}`;
I have created a bit of Ruby code that seems to satisfy the CloudWatch URL parser. I'm not sure why you have to double-escape some things and then replace % with $ in others. I'm guessing there is some reason behind it, but I couldn't figure out a nice way to do it, so I'm just brute-forcing it. If you have something better, or know why they do this, please add a comment.
NOTE: The filter I tested with is kinda basic, and I'm not sure what might need to change if you get really fancy with it.
# Basic URL that is the same across all requests
url = 'https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/'
# CloudWatch log group
log_group = '/aws/my/log/group'
# Either specify the instance you want to search or leave it out to search all instances
instance = '/log-events/i-xxxxxxxxxxxx'
# OR
instance = '/log-events'
# The filter to apply.
filter = '[incoming_ip, user_name, user_ip , timestamp, request, status_code = 5*, bytes, url, user_agent]'
# Start time. There might be an End time as well but my queries haven't used
# that yet so I'm not sure how it's formatted. It should be pretty similar
# though.
hours = 48
start = "&start=-#{hours*60*60*1000}"
# This will get you the final URL
final = url + CGI.escape(CGI.escape(log_group)) + instance + '$3FfilterPattern$3D' + CGI.escape(CGI.escape(filter)).gsub('%','$') + CGI.escape(start).gsub('%','$')
A bit late, but here is a Python implementation:
import re
import urllib.parse

def get_cloud_watch_search_url(search, log_group, log_stream, region=None):
    """Return a properly formatted URL string for searching CloudWatch logs.

    search = '{$.message: "You are amazing"}'
    log_group = the group of messages you want to search
    log_stream = the stream of logs to search
    """
    url = f'https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}'

    def aws_encode(value):
        """The heart of this is that AWS likes to quote things twice with some substitution"""
        value = urllib.parse.quote_plus(value)
        value = re.sub(r"\+", " ", value)
        return re.sub(r"%", "$", urllib.parse.quote_plus(value))

    bookmark = '#logsV2:log-groups'
    bookmark += '/log-group/' + aws_encode(log_group)
    bookmark += "/log-events/" + log_stream
    bookmark += re.sub(r"%", "$", urllib.parse.quote("?filterPattern="))
    bookmark += aws_encode(search)
    return url + bookmark
This then allows you to quickly verify it:
>>> real = 'https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Fapp$252Fdjango/log-events/production$3FfilterPattern$3D$257B$2524.msg$253D$2522$2525s$2525s+messages+to+$2525s+pk$253D$2525d...$2522$257D'
>>> constructed = get_cloud_watch_search_url(search='{$.msg="%s%s messages to %s pk=%d..."}', log_group='/app/django', log_stream='production', region='us-west-2')
>>> real == constructed
True
I encountered this problem recently when I wanted to generate a CloudWatch Insights URL. TypeScript version below:
import { v4 } from "uuid"; // assumption: queryId is a fresh UUID from the 'uuid' package

export function getInsightsUrl(
    start: Date,
    end: Date,
    query: string,
    sourceGroup: string,
    region = "us-east-1"
) {
    const p = (m: string) => escape(m);
    // encodes inner values
    const s = (m: string) => escape(m).replace(/\%/gi, "*");
    const queryDetail =
        p(`~(end~'`) +
        s(end.toISOString()) +
        p(`~start~'`) +
        s(start.toISOString()) +
        p(`~timeType~'ABSOLUTE~tz~'UTC~editorString~'`) +
        s(query) +
        p(`~isLiveTail~false~queryId~'`) +
        s(v4()) +
        p(`~source~(~'`) +
        s(sourceGroup) +
        p(`))`);
    return (
        `https://console.aws.amazon.com/cloudwatch/home?region=${region}#logsV2:logs-insights` +
        escape("?queryDetail=" + queryDetail).replace(/\%/gi, "$")
    );
}
Github GIST
A Python solution based on @Pål Brattberg's answer:
cloudwatch_log_template = "https://{AWS_REGION}.console.aws.amazon.com/cloudwatch/home?region={AWS_REGION}#logsV2:log-groups/log-group/{LOG_GROUP_NAME}/log-events/{LOG_STREAM_NAME}"
log_url = cloudwatch_log_template.format(
    AWS_REGION=AWS_REGION, LOG_GROUP_NAME=CLOUDWATCH_LOG_GROUP, LOG_STREAM_NAME=LOG_STREAM_NAME
)
Make sure to substitute illegal characters first (see OP) if you used any.
I encountered this problem recently when I wanted to generate a CloudWatch Insights URL. PHP version below:
<?php

function getInsightsUrl($region = 'ap-northeast-1') {
    // https://stackoverflow.com/questions/67734825/why-is-laravels-carbon-toisostring-different-from-javascripts-toisostring
    $start = now()->subMinutes(2)->format('Y-m-d\TH:i:s.v\Z');
    $end = now()->addMinutes(2)->format('Y-m-d\TH:i:s.v\Z');

    $filter = 'INFO';
    $logStream = 'xxx_backend_web';
    $sourceGroup = '/ecs/xxx_backend_prod';
    // $sourceGroup = '/aws/ecs/xxx_backend~\'/ecs/xxx_backend_dev'; // multiple source groups

    $query =
        "fields #timestamp, #message \n" .
        "| sort #timestamp desc\n" .
        "| filter #logStream like '$logStream'\n" .
        "| filter #message like '$filter'\n" .
        "| limit 20";

    $queryDetail = urlencode(
        ("~(end~'") .
        ($end) .
        ("~start~'") .
        ($start) .
        ("~timeType~'ABSOLUTE~tz~'Local~editorString~'") .
        ($query) .
        ("~isLiveTail~false~queryId~'") .
        ("~source~(~'") .
        ($sourceGroup) .
        ("))")
    );

    $queryDetail = preg_replace('/\%/', '$', urlencode("?queryDetail=" . $queryDetail));

    return
        "https://console.aws.amazon.com/cloudwatch/home?region=${region}#logsV2:logs-insights"
        . $queryDetail;
}
A coworker came up with the following JavaScript solution.
import JSURL from 'jsurl';
const QUERY = {
    end: 0,
    start: -3600,
    timeType: 'RELATIVE',
    unit: 'seconds',
    editorString: "fields #timestamp, #message, #logStream, #log\n| sort #timestamp desc\n| limit 200\n| stats count() by bin(30s)",
    source: ['/aws/lambda/simpleFn'],
};

function toLogsUrl(query) {
    return `#logsV2:logs-insights?queryDetail=${JSURL.stringify(query)}`;
}

toLogsUrl(QUERY);
// #logsV2:logs-insights?queryDetail=~(end~0~start~-3600~timeType~'RELATIVE~unit~'seconds~editorString~'fields*20*40timestamp*2c*20*40message*2c*20*40logStream*2c*20*40log*0a*7c*20sort*20*40timestamp*20desc*0a*7c*20limit*20200*0a*7c*20stats*20count*28*29*20by*20bin*2830s*29~source~(~'*2faws*2flambda*2fsimpleFn))
I have to elevate @WayneB's answer because it just works. No encoding required, just follow his template. I just confirmed it works for me. Here's what he said in one of the comments above:
"Apparently there is an easier link which does the encoding/replacement for you: https://console.aws.amazon.com/cloudwatch/home?region=${process.env.AWS_REGION}#logEventViewer:group=${logGroup};stream=${logStream}"
Thanks for this answer Wayne, I just wish I had seen it sooner!
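Transliterated to Python, that shortcut might look like the sketch below (the logEventViewer route is the quoted console behavior rather than a documented API, and the example stream name is hypothetical):

def log_event_viewer_url(region, log_group, log_stream):
    # The console performs the encoding/replacement itself on this route.
    return (f"https://console.aws.amazon.com/cloudwatch/home?region={region}"
            f"#logEventViewer:group={log_group};stream={log_stream}")

print(log_event_viewer_url("us-east-1", "/aws/lambda/simpleFn", "2017/05/25/[$LATEST]example"))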
Since the earlier Python contributions relate to log groups and not to Log Insights, this is my contribution. I guess I could have done better with the inner functions, but it is a good starting point:
from datetime import datetime, timedelta
from urllib.parse import quote


def get_aws_cloudwatch_log_insights(query_parameters, aws_region):
    def quote_string(input_str):
        return f"""{quote(input_str, safe="~()'*").replace('%', '*')}"""

    def quote_list(input_list):
        quoted_list = ""
        for item in input_list:
            if isinstance(item, str):
                item = f"'{item}"
            quoted_list += f"~{item}"
        return f"({quoted_list})"

    params = []
    for key, value in query_parameters.items():
        if key == "editorString":
            value = "'" + quote(value)
            value = value.replace('%', '*')
        elif isinstance(value, str):
            value = "'" + value
        if isinstance(value, bool):
            value = str(value).lower()
        elif isinstance(value, list):
            value = quote_list(value)
        params += [key, str(value)]

    object_string = quote_string("~(" + "~".join(params) + ")")
    escaped_object = quote(object_string, safe="*").replace("~", "%7E")
    with_query_detail = "?queryDetail=" + escaped_object
    result = quote(with_query_detail, safe="*").replace("%", "$")

    final_url = f"https://{aws_region}.console.aws.amazon.com/cloudwatch/home?region={aws_region}#logsV2:logs-insights{result}"
    return final_url

Example:
aws_region = "eu-west-1"
query = """fields #timestamp, #message
| filter #message not like 'example'
| sort #timestamp asc
| limit 100"""
log_groups = ["/application/sample1", "/application/sample2"]
query_parameters = {
    "end": datetime.utcnow().isoformat(timespec='milliseconds') + "Z",
    "start": (datetime.utcnow() - timedelta(days=2)).isoformat(timespec='milliseconds') + "Z",
    "timeType": "ABSOLUTE",
    "unit": "seconds",
    "editorString": query,
    "isLiveTrail": False,
    "source": log_groups,
}
print(get_aws_cloudwatch_log_insights(query_parameters, aws_region))
Yet another Python solution:
from urllib.parse import quote


def aws_quote(s):
    return quote(quote(s, safe="")).replace("%", "$")


def aws_cloudwatch_url(region, log_group, log_stream):
    return "/".join([
        f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#logsV2:log-groups",
        "log-group",
        aws_quote(log_group),
        "log-events",
        aws_quote(log_stream),
    ])

aws_cloudwatch_url("ap-southeast-2", "/var/log/syslog", "process/pid=1")
https://ap-southeast-2.console.aws.amazon.com/cloudwatch/home?region=ap-southeast-2#logsV2:log-groups/log-group/$252Fvar$252Flog$252Fsyslog/log-events/process$252Fpid$253D1

Text file value replace in python

I am trying to replace text values as below. I have two text files:
1 - input.txt
abc = 123
xyz = 456
pqr = 789
2 - content.txt
AAA = abc
XXX = xyz
PPP = pqr
Now I need to read the input.txt file, replace the values in content.txt with the values from input.txt, and get the output file below.
3 - new.txt
AAA = 123
XXX = 456
PPP = 789
How can I do this?
First read the contents of the files into two lists in the following way:
file1handle = open('filename1', 'r')
file1 = file1handle.readlines()
file1handle.close()
file2handle = open('filename2', 'r')
file2 = file2handle.readlines()
file2handle.close()
Then iterate over the contents, match the variable names with their assignments, and put the values into a third list in the following way:
file3 = []
for item in file1:
    name, value = item.strip().split(' = ')
    for item2 in file2:
        name2, assignment = item2.strip().split(' = ')
        # Here we check which name is to be assigned which value;
        # stripping the newlines first is what makes the comparison match.
        if assignment == name:
            file3.append(name2 + ' = ' + value + '\n')
Then write the contents into the file in the following way:
filehandle3 = open('filename3', 'w')
for line in file3:
    filehandle3.write(line)
filehandle3.close()
This may help you:
_input = {}
with open('input.txt', 'r') as f:
    s = f.read()
    # skip empty lines (e.g. a trailing newline) so split(' = ') doesn't fail
    _input = dict((a.split(' = ')[0], int(a.split(' = ')[1])) for a in s.split('\n') if a)

_content = {}
with open('content.txt', 'r') as f:
    s = f.read()
    _content = dict((a.split(' = ')[0], a.split(' = ')[1]) for a in s.split('\n') if a)

for key in _content:
    _content[key] = _input[_content[key]]
Result:
In [18]: _content
Out[18]: {'AAA': 123, 'PPP': 789, 'XXX': 456}
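The snippet above stops short of writing the result back out; a short follow-up sketch for producing new.txt:

with open('new.txt', 'w') as f:
    for key, value in _content.items():
        f.write('%s = %s\n' % (key, value))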
How about using pandas? It's shorter, easier to read, and faster with large files.
import pandas as pd
import numpy as np

input = pd.read_csv("input.txt", sep="=", header=None, usecols=[1])
content = pd.read_csv("content.txt", sep="=", header=None, usecols=[0])
foo = np.hstack(([content.values, input.values]))
new = pd.DataFrame(foo)
new.to_csv("new.txt", index=False, sep="=", header=None)
import re

class Defs:
    def __init__(self, defs_file):
        self._defs = {}
        with open(defs_file) as df:
            line_num = 0
            for l in df:
                line_num += 1
                m = re.match(r'\s*(\w+)\s*=\s*(\S+)\s*', l)
                assert m, \
                    "invalid assignment syntax with \"{}\" at line {}".format(
                        l.rstrip(), line_num)
                self._defs[m.group(1)] = m.group(2)

    def __getitem__(self, var):
        return self._defs[var]

    @property
    def dict(self):
        return self._defs

class Replacer:
    def __init__(self, defs):
        self._defs = defs

    def replace_with_defs(self, context_file, output_file):
        with open(context_file) as context, open(output_file, 'w') as output:
            for line in context:
                string_repl = re.sub(r'\b(\w+)\b',
                    lambda m: self._defs.dict.get(m.group(1)) or m.group(1), line)
                output.write(string_repl)

def main():
    defs = Defs('input.txt')
    repl = Replacer(defs)
    repl.replace_with_defs('context.txt', 'output.txt')

if __name__ == '__main__':
    main()
To describe what's going on above: the Defs class takes a defs_file, which is the input.txt of assignments, and stores them in a dict binding each variable name to the associated value. The Replacer class takes a Defs object and uses it to iterate over each line in the context_file (i.e. context.txt), replacing any token that is a known variable name with the value recorded in the Defs object, and writes the result out to output_file (i.e. output.txt). If a token doesn't exist in the Defs object as a valid variable name, it is written out as-is.

"AttributeError: 'NoneType' object has no attribute 'split'"

def get_par(filename_or_pard):
    # automatically search for the path
    Path = GetPathOsmosisPth()
    f = open(Path, 'r')
    TheLines = f.readlines()
    Path = TheLines[0][0:-1] + '/osmosis/'
    f.close()
    # Get the name of the path to the data
    ThePath = locate_dir('GeneralParameters', Path)
    Thepdata = ThePath.split('GeneralParameters')[0]  # (line 216 - this line gives the error)
    # check if we have not provided an external data path
    DataDir = CheckInputDataDirectory()
    # DataDir = 'Data_CS'
    if DataDir is None:
        pdata = Thepdata
    else:
        pdata = os.path.join(Path, DataDir)
    # search for the file
    if isstring(filename_or_pard):
        # myprintv("isstring: ", filename_or_pard)
        # creating the dictionary
        ThePath = locate(filename_or_pard, pdata)
        f = ThePath + os.path.sep + filename_or_pard
        pard = create_par_structure(f)
        # creating the class par_struct
        par = par_struct(pard)
        # store the filename with the parameters in the par structure
        par.TheFilename = filename_or_pard
    else:
        # myprint2v("not isstring: ", filename_or_pard, type(filename_or_pard))
        # the dictionary is provided as input
        pard = filename_or_pard
        # creating the class par_struct
        if isdict(pard):
            par = par_struct(pard)
            par._FromStringToTrueType()
        else:
            par = pard
    # if parameters.txt, set the path_data
    if hasattr(par, 'path_data'):
        par.path_data = pdata
        par.root_path = Path
        # myprintv("pdata: ", par.path_data)
    if hasattr(par, 'path_packages'):
        par.path_packages = os.path.join(Path, par.path_packages)
    # returning the result
    return par
When I run my program I get an error with both split methods used above. The error displayed is:
line 216, in get_par
    Thepdata = ThePath.split('GeneralParameters', Path)[0]
AttributeError: 'NoneType' object has no attribute 'split'
I believe I am making a small error, but I don't know where; I am just starting programming.
Thanking you in advance.
Your function locate_dir returns None. This happens if pattern never matches a path. Since you use your own print function myprinte, you are probably suppressing your error message.
This is assuming your locate_dir is currently formatted as below.
Formatted based on your comment:
def locate_dir(pattern, r=os.curdir):
    ok = 0
    for path, dirs, files in os.walk(os.path.abspath(r)):
        if pattern in path:
            ok = 1
            return path
    if ok == 0:  # ok will always be 0 here, you would have left the function otherwise
        myprinte("Warning in locate: No directory found!")
    # end of function *implicitly* returns None (i.e. returns "Nothing")
Note that you should raise an Exception if your code fails to produce a result that other code requires.
def locate_dir(pattern, r=os.curdir):
    for path, dirs, files in os.walk(os.path.abspath(r)):
        if pattern in path:
            return path
    raise FileNotFoundError("No file found for pattern %s" % pattern)
This way, you do not accidentally suppress errors.
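Applied to get_par from the question, a defensive guard right before the failing line makes the failure explicit (a sketch using the question's own names):

ThePath = locate_dir('GeneralParameters', Path)
if ThePath is None:
    # Fail loudly here instead of letting the AttributeError surface later.
    raise FileNotFoundError("No 'GeneralParameters' directory found under %s" % Path)
Thepdata = ThePath.split('GeneralParameters')[0]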

Python 2 to 3 port - Still has one more error

I am porting a program called markovgenerator, which I found on the web, from Python 2 to Python 3. It all seems to work just fine.
Here is the code (Python 3 version):
import random

class Markov:
    def __init__(self, file, size):
        self.size = size
        self.starts = []
        self.cache = {}
        self.file_to_words(file)
        self.parse_words()

    def file_to_words(self, file):
        file.seek(0)
        data = file.read()
        self.words = data.split("\n")

    def tuples(self, word):
        if len(word) < self.size - 1:
            return
        word = word + "\n"
        for i in range(len(word) - self.size):
            yield (word[i:i+self.size], word[i+self.size])

    def parse_words(self):
        for word in self.words:
            self.starts.append(word[:self.size])
            for key, next in self.tuples(word):
                if key in self.cache:
                    self.cache[key].append(next)
                else:
                    self.cache[key] = [next]

    def generate_word(self):
        key = random.choice(self.starts)
        word = key
        next = random.choice(self.cache[key])
        while next != "\n":
            word = word + next
            key = key[1:] + next
            next = random.choice(self.cache[key])
        return word

from optparse import OptionParser

def main():
    parser = OptionParser()
    parser.add_option('-p', type='int', dest='prev_num', default=3,
                      help='number of previous letters to base chain on')
    parser.add_option('-n', type='int', dest='num', default=5,
                      help='number of generated words')
    parser.add_option('-s', '--source-text', type='string',
                      default='wordlist-en.txt', dest='source',
                      help='file to use as basis for generating the words')
    (options, args) = parser.parse_args()
    file = open('alice.txt')
    markov = Markov(file, options.prev_num)
    file.close()
    for i in range(options.num):
        print(markov.generate_word())

if __name__ == '__main__':
    main()
Except I get this error:
    next = random.choice(self.cache[key])
KeyError: ''
The error appears in the generate_word() function.
It must be from the translation to Python 3. Any ideas? I don't see why I am getting a KeyError, as I pass key to other places with no problem.
Thanks for the help!!!
This fixes that error and ignores any blank lines in the output:
import random

class Markov:
    def __init__(self, file, size):
        self.size = size
        self.starts = []
        self.cache = {}
        self.file_to_words(file)
        self.parse_words()
        self.cache[''] = '\n'

    def file_to_words(self, file):
        file.seek(0)
        data = file.read()
        self.words = data.split("\n")

    def tuples(self, word):
        if len(word) < self.size - 1:
            return
        word = word + "\n"
        for i in range(len(word) - self.size):
            yield (word[i:i+self.size], word[i+self.size])

    def parse_words(self):
        for word in self.words:
            self.starts.append(word[:self.size])
            for key, next in self.tuples(word):
                if key in self.cache:
                    self.cache[key].append(next)
                else:
                    self.cache[key] = [next]

    def generate_word(self):
        key = random.choice(self.starts)
        word = key
        next = random.choice(self.cache[key])
        while not next == "\n":
            word = word + next
            key = key[1:] + next
            next = random.choice(self.cache[key])
        return word

from optparse import OptionParser

def main():
    parser = OptionParser()
    parser.add_option('-p', type='int', dest='prev_num', default=3,
                      help='number of previous letters to base chain on')
    parser.add_option('-n', type='int', dest='num', default=5,
                      help='number of generated words')
    parser.add_option('-s', '--source-text', type='string',
                      default='wordlist-en.txt', dest='source',
                      help='file to use as basis for generating the words')
    (options, args) = parser.parse_args()
    file = open('alice.txt')
    markov = Markov(file, options.prev_num)
    file.close()
    iters = 0
    while iters < options.num:
        word = markov.generate_word()
        if word != '\n' and word != '':
            print(word)
            iters += 1

if __name__ == '__main__':
    main()
The string '' was registering as a start key because blank lines in the source file become empty strings after data.split("\n"): parse_words appends word[:self.size] (which is '' for an empty word) to self.starts, while tuples() bails out early for such short words, so self.cache never gains a '' key. Picking '' as the start in generate_word then raises the KeyError. Everything I tried to remove it caused the program to break, so I added a line to __init__ which sets the next word of '' to \n, giving the intended result by quitting when we see a newline. An alternative fix is sketched below.
If there's anything wrong with this code, let me know and I will be happy to fix it.
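The alternative fix, as a sketch: skip blank and too-short words in parse_words so that '' never enters self.starts in the first place.

def parse_words(self):
    for word in self.words:
        if len(word) < self.size:
            continue  # blank/short lines can't seed a chain; skip them
        self.starts.append(word[:self.size])
        for key, next in self.tuples(word):
            if key in self.cache:
                self.cache[key].append(next)
            else:
                self.cache[key] = [next]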

Webscraping - why do I only get the LAST row in a HTML table? BeautifulSoup

I'm trying to scrape a bunch of HTML files I have in a folder on my computer. The data I want is stored in a table, and I'm able to get the last row in the table from each file, but the other rows are ignored!
I've copied parts of the HTML to Pastebin, here: http://pastebin.com/hajr8SFi
This is the code I have so far. Again, it works with the last row, but not the others. So I guess there's a problem with the loop? I've tried to figure it out, but no results so far :(
def processData(pageFile):
    f = open(pageFile, "r")
    page = f.read()
    f.close()
    soup = BeautifulSoup(page)
    ewo = soup.find_all("td", {"class": "date"})
    ewo2 = soup.find_all("td", {"class": "user"})
    ewo3 = soup.find_all("p", {"class": "single"})
    fishs = []
    dogs = []
    rats = []
    for html in ewo:
        feedbacks = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
        fishs.append(feedbacks.encode("utf-8").strip())
    for html2 in ewo2:
        feedbacks2 = BeautifulSoup(str(html2).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
        dogs.append(feedbacks2.encode("utf-8").strip())
    str1 = ''.join(dogs)
    for html3 in ewo3:
        feedbacks3 = BeautifulSoup(str(html3).strip()).encode("utf-8").replace("\n", "")  # convert the html to text
        rats.append(feedbacks3.encode("utf-8").split('<p class="single">')[1].split("</p>")[0].strip())
    csvfile = open(today + ' evo.csv', 'ab')
    writer = csv.writer(csvfile)
    for fish, dog, rat in zip(fishs, dogs, rats):
        writer.writerow([fish, dog, rat])
    csvfile.close()

today = datetime.datetime.now().strftime('%Y-%m-%d')
dir = "files/"

csvFile = today + " file.csv"
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["F", "I", "V"])
csvfile.close()

fileList = os.listdir(dir)
totalLen = len(fileList)
count = 1
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)  # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..."  # display status
    count = count + 1  # increment counter