Ruby - update instance variable from inside a block - ruby-2.0

How do you update an instance variable from inside a block?
E.g.
def initialize_people(people)
  people.each do |person|
    person = "Bob" if person.nil?
  end
end

@first = "Adam"
@second = "Eve"
@third = nil
people = [@first, @second, @third]
initialize_people(people)
puts people
# outputs
# Adam
# Eve
# would like it to output
# Adam
# Eve
# Bob

The expression:
people.each do |person|
defines person as a variable local to the block. So all your loop did was reassign that block-local person to "Bob" when it was nil; the array elements were never touched. You could do something like this:
def initialize_people(people)
  people.each_with_index do |person, i|
    people[i] = "Bob" if person.nil?
  end
end
Or, shorter (note this replaces only the first nil):
i = people.find_index(nil)
people[i] = "Bob" if i

Related

How to parse through a list of links in a loop using Beautiful Soup?

I am a newbie in coding, so please go easy. I have a work task that I want to automate a bit.
I have to collect data from a government website, but I think it is designed the way it is to protect it from bots and DDoS attacks. What I have to do as part of my work is click on each of these links individually and record the case's:
Filing date
Last listed date
Party name
Case status
If the case status is 'disposed', I then read the PDFs to check the reason for disposal (which can't be automated).
I have to go through many pages of this, and it's hard to even copy-paste that much information. So for the first four details I tried to create two scripts: one retrieves the hyperlinks from the table page, and the second goes through the list of links and gets the details listed above. It's the second script where I am facing problems.
List of changes in the URL:
case_list = [
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjA4MjAyMA==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjA5MjAyMA==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjEwMjAyMA==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjExMjAyMQ==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjEyMjAyMQ==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjEzMjAyMA==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjE0MjAyMA==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjE2MjAyMQ==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjE3MjAyMA==",
"case-details?bench=YW1yYXZhdGk=&filing_no=MjgxMjEyOTAwMjIwMjAyMA=="]
Code for a single URL:
response = requests.get(url).text
soup = BeautifulSoup(response, 'html.parser')
tr= soup.find_all('td')
status = tr[19].text
#gets Filing date
filing_date = tr[3].text
#gets title
case_title = tr[5].text
#gets case disposed date
disposal_date = tr[15].text
Function for grabbing details from the URL:
def get_case_components(url):
    response = requests.get(url).text
    soup = BeautifulSoup(response, 'html.parser')
    tr = soup.find_all('td')
    status = tr[19].text
    # gets filing date
    filing_date = tr[3].text
    # gets title
    case_title = tr[5].text
    # gets disposal date
    disposal_date = tr[15].text
    return filing_date, case_title, status, disposal_date
Function for appending the DataFrame:
def get_case(df):
    # loop for going through the case_list
    for links in url_list:
        url = links
        # putting the URL in the function
        get_case_components(url)
        df = df.append({"filing_date": filing_date,
                        "case_title": case_title, "status": status,
                        "disposal_date": disposal_date}, ignore_index=True)
        time.sleep(1)
    return df
Calling the get_case() function with the DataFrame:
df= pd.DataFrame(columns = ["filing_date", "case_title", "status", "disposal_date"])
df = get_case(df)
df.head()
For some reason I keep getting the same thing over and over again as output, as if a single case fills the entire DataFrame.
0 14-12-2020 Rajani Jagarlamudi VS Sharadakrupa Cold Storag... Pending 16-03-2022 \t
1 14-12-2020 Rajani Jagarlamudi VS Sharadakrupa Cold Storag... Pending 16-03-2022 \t
2 14-12-2020 Rajani Jagarlamudi VS Sharadakrupa Cold Storag... Pending 16-03-2022 \t
3 14-12-2020 Rajani Jagarlamudi VS Sharadakrupa Cold Storag... Pending 16-03-2022 \t
4 14-12-2020 Rajani Jagarlamudi VS Sharadakrupa Cold Storag... Pending 16-03-2022 \t
I made a script that gets the cases and stores them in a list.
The problem in your script is that you were manually selecting the table cells by hard-coded index when you did:
tr = soup.find_all('td')
status = tr[19].text
#gets Filing date
filing_date = tr[3].text
#gets title
case_title = tr[5].text
#gets disposed date
disposal_date = tr[15].text
My code dynamically scrapes them, so you don't have to worry about it.
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass

HOST = "https://nclt.gov.in/"
LINK = "https://nclt.gov.in/order-judgement-date-wise-search?bench=Y2hlbm5haQ%3D%3D&start_date=MDEvMDEvMjAyMQ%3D%3D&end_date=MDEvMDEvMjAyMg%3D%3D"

@dataclass
class Case:
    number: str
    filing_num: str
    case_no: str
    pvr: str
    listing_date: str
    status: str

def get_cases(url):
    res = requests.get(url)
    if res.status_code == 200:
        print("getting cases")
        return res.text

def extract_case(html):
    soup = BeautifulSoup(html, "html.parser")
    cases = [Case(*[td.text for td in tr.select("td")]) for tr in soup.select("table tbody tr")]
    next, *rest = [link["href"] for link in soup.select(".page-link") if link.text.strip() == "Next"]
    if len(next):
        next = HOST + next
    return cases, next

def main():
    total_cases = []
    next = ""
    amount = int(input("How many pages do you want to scrape?"))
    oamount = amount
    while amount:
        if len(next) and oamount != amount:
            html = get_cases(next)
        else:
            html = get_cases(LINK)
        cases, next = extract_case(html)
        total_cases.extend(cases)
        amount -= 1
        print(cases, next)
    print("Scraped", len(total_cases), "cases")

if __name__ == "__main__":
    main()
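Separately, the repeated rows in the original get_case almost certainly come from the tuple returned by get_case_components being discarded, so the names used in df.append never change between iterations. A minimal sketch of the loop with the return values captured (keeping the asker's function, and assuming url_list holds full, absolute URLs) would be:

def get_case(df):
    for url in url_list:  # assumed to contain complete URLs, not just the query suffixes
        # capture the tuple that get_case_components returns
        filing_date, case_title, status, disposal_date = get_case_components(url)
        df = df.append({"filing_date": filing_date,
                        "case_title": case_title, "status": status,
                        "disposal_date": disposal_date}, ignore_index=True)
        time.sleep(1)
    return df

df.append is kept here to match the original snippet; it was removed in pandas 2.x, where collecting the rows in a list and building the DataFrame once is the usual replacement.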

Reopening class and adding instance variables

I am teaching myself Crystal and I came across a section in the documentation that I don't quite understand.
Here is the documentation page.
On that page it gives the following code:
class Person
  @age = 0

  def initialize(@name : String)
  end
end
This is followed by the following statement:
This will initialize @age to zero in every constructor. This is useful to avoid duplication, but also to avoid the Nil type when reopening a class and adding instance variables to it.
Can someone please explain, or show me an example of the bolded behaviour? I'm not certain I understand what it means by "reopening a class and adding an instance variable to it".
Here is an example of reopening a class, and adding an instance variable to it:
class Person
  @age = 0

  def initialize(@name : String)
  end
end

# Usually in another file
class Person
  def gender=(gender : String)
    @gender = gender
  end

  def gender
    @gender
  end
end

person = Person.new("RX14")
typeof(person.gender) # => String | Nil
person.gender         # => nil
person.gender = "???"
person.gender         # => "???"
We add the @gender instance variable, which is not initialized in the def initialize. The compiler infers the type of @gender to be String | Nil, since it is assigned a String in gender= but is not initialized in the constructor, meaning it can also be nil.
However, we can add a default value to the @gender instance variable, and it applies to all constructors, whether they are defined before or after the default:
class Person
  @age = 0

  def initialize(@name : String)
  end
end

# Usually in another file
class Person
  @gender = "unknown"

  def gender=(gender : String)
    @gender = gender
  end

  def gender
    @gender
  end
end

person = Person.new("RX14")
typeof(person.gender) # => String
person.gender         # => "unknown"
person.gender = "???"
person.gender         # => "???"
This avoids the @gender variable getting the String | Nil type, since it is initialized to "unknown" when Person is constructed. Since Nil types are often avoided, this is an important tool to have.

How to search for a sentence in tweets

I am trying to gather a dataset of Twitter accounts that posted a status update in the form of a statement of diagnosis, such as "I was diagnosed with X today", where X would be a condition such as depression.
I was able to use the TwitterSearch library, but it only searches for keywords, not a full sentence.
from TwitterSearch import *

try:
    tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
    tso.set_keywords(['depression', 'diagnosed'])  # let's define all words we would like to have a look for
    tso.set_language('en')  # we want to see English tweets only
    tso.set_include_entities(False)  # and don't give us all those entity information

    ts = TwitterSearch(
        consumer_key = 'x',
        consumer_secret = 'y',
        access_token = 'z',
        access_token_secret = 't'
    )

    for tweet in ts.search_tweets_iterable(tso):
        print( tweet['user']['screen_name'], tweet['text'] )

except TwitterSearchException as e:
    print(e)
However, I would like to use a regular expression to get only tweets that match the full sentence.
You can search for full sentences, not only keywords, with set_keywords:
from TwitterSearch import *

try:
    tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
    tso.set_keywords(['I was diagnosed with depression today'])
    tso.set_language('en')  # we want to see English tweets only
    tso.set_include_entities(False)

    ts = TwitterSearch(
        consumer_key = 'c',
        consumer_secret = 's',
        access_token = 'at',
        access_token_secret = 'ats'
    )

    # this is where the fun actually starts :)
    for tweet in ts.search_tweets_iterable(tso):
        print( '@%s tweeted: %s' % ( tweet['user']['screen_name'], tweet['text'] ) )

except TwitterSearchException as e:  # take care of all those ugly errors if there are some
    print(e)
So, no need to filter the result with regex.
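If stricter matching is still wanted (the search API matches keywords rather than exact phrasing), one option is to post-filter the returned tweets with Python's re module. A minimal sketch, assuming the same ts and tso objects as above; the pattern shown is only an illustrative guess at the phrasing being looked for:

import re

# hypothetical pattern: "I was/got diagnosed with depression" anywhere in the tweet text
pattern = re.compile(r"\bI (was|got) diagnosed with depression\b", re.IGNORECASE)

matching_tweets = [
    tweet for tweet in ts.search_tweets_iterable(tso)
    if pattern.search(tweet['text'])
]

for tweet in matching_tweets:
    print(tweet['user']['screen_name'], tweet['text'])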

Don't know why the below script would not crawl glassdoor.com

Don't know why the below Python script would not crawl the glassdoor.com website.
from bs4 import BeautifulSoup   # documentation available at: www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import NavigableString, Tag
import requests                 # To send http requests and access the page: docs.python-requests.org/en/latest/
import csv                      # To create the output csv file
import unicodedata              # To work with the string encoding of the data

entries = []
entry = []
urlnumber = 1           # Give the page number to start with

while urlnumber < 100:  # Give the page number to end with
    #print type(urlnumber), urlnumber
    url = 'http://www.glassdoor.com/p%d' % (urlnumber,)  # Give the url of the forum, excluding the page number in the hyperlink
    #print url
    try:
        r = requests.get(url, timeout=10)  # Sending a request to access the page
    except Exception,e:
        print e.message
        break
    if r.status_code == 200:
        data = r.text
    else:
        print str(r.status_code) + " " + url
    soup = BeautifulSoup(data)  # Getting the page source into the soup
    for div in soup.find_all('div'):
        entry = []
        if div.get('class') != None and div.get('class')[0] == 'Comment':  # A single post is referred to as a comment. Each comment is a block denoted by a div tag with a class called Comment.
            ps = div.find_all('p')        # gets all the tags called p into a variable ps
            aas = div.find_all('a')       # gets all the tags called a into a variable aas
            spans = div.find_all('span')
            times = div.find_all('time')  # used to extract the time tag, which gives the date of the post
            concat_str = ''
            for str in aas[1].contents:   # iterates over the contents between the tag start and end
                if str != "<br>" or str != "<br/>":  # This denotes breaks in the post, which we need to work around.
                    concat_str = (concat_str + ' ' + str.encode('iso-8859-1')).strip()  # The encoding is because the extracted format is unicode. We need a uniform structure to work with the strings.
            entry.append(concat_str)
            concat_str = ''
            for str in times[0].contents:
                if str != "<br>" or str != "<br/>":
                    concat_str = (concat_str + ' ' + str.encode('iso-8859-1')).strip()
            entry.append(concat_str)
            #print "-------------------------"
            for div in div.find_all('div'):
                if div.get('class') != None and div.get('class')[0] == 'Message':  # Extracting the div tag with the class attribute Message.
                    blockqoutes = []
                    x = div.get_text()
                    for bl in div.find_all('blockquote'):
                        blockqoutes.append(bl.get_text())  # blockquote holds a quote made by a person; get_text eliminates the hyperlinks and pulls out only the data.
                        bl.decompose()
                    entry.append(div.get_text().replace("\n", " ").replace("<br/>", "").encode('ascii', 'replace').encode('iso-8859-1'))
                    for bl in blockqoutes:
                        #print bl
                        entry.append(bl.replace("\n", " ").replace("<br/>", "").encode('ascii', 'replace').encode('iso-8859-1'))
        #print entry
        entries.append(entry)
    urlnumber = urlnumber + 1  # increment so that we can extract the next page

with open('gd1.csv', 'w') as output:
    writer = csv.writer(output, delimiter=',', lineterminator='\n')
    writer.writerows(entries)

print "Wrote to gd1.csv"
I fixed some errors in your script, but I guess it doesn't print anything because you only get 405 (Method Not Allowed) responses!
Also, your previous try/except block didn't print the error message. Was that on purpose?
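For reference, here is a sketch of how the request-and-check part could report and skip failed pages instead of falling through with stale (or undefined) data. This is an assumption about a likely fix, not the answerer's actual script, and the browser-style User-Agent header is also an assumption, since sites like Glassdoor often reject requests that don't look like a browser:

import requests

# hypothetical browser-like header; many sites answer "bare" clients with 403/405
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}

def fetch_page(urlnumber):
    url = 'http://www.glassdoor.com/p%d' % (urlnumber,)
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as e:
        print("request failed for %s: %s" % (url, e))
        return None
    if r.status_code != 200:
        # report and skip rather than silently reusing the previous page's data
        print("%d %s" % (r.status_code, url))
        return None
    return r.text

The main loop would then call fetch_page(urlnumber) and continue to the next page whenever it returns None.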

How to get all groups that a specific user is a member of - Python, Active Directory

I'm trying to set a filter to get all the groups that a specific user is a member of.
I'm using Python; currently I have:
import traceback
import ldap

try:
    l = ldap.open("192.168.1.1")
    .
    .
    .
    l.simple_bind_s(username, password)
    #######################################################################
    f_filterStr = '(objectclass=group)'  # Would like to modify this, so I'll not have to make the next loop ...
    #######################################################################
    # the next command takes some seconds
    results = l.search_s(dn_recs, ldap.SCOPE_SUBTREE, f_filterStr)
    for i in results:
        if dict == type(i[1]):
            group_name = i[1].get('name')
            if list == type(group_name):
                group_name = group_name[0]
            search_str = "CN=%s," % username_bare
            if -1 != ("%s" % i[1].get('member')).find(search_str):
                print "User belongs to this group! %s" % group_name
except Exception,e:
    pass  # handle as you wish
I think you are making this much too hard.
I'm no Python expert, but you can easily query Microsoft Active Directory for all the groups a user is a member of by using the LDAP_MATCHING_RULE_IN_CHAIN rule (OID 1.2.840.113556.1.4.1941) in a filter like:
(member:1.2.840.113556.1.4.1941:=CN=UserName,CN=Users,DC=YOURDOMAIN,DC=NET)
-jim
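Plugged into the asker's python-ldap code, that filter replaces f_filterStr and makes the manual membership loop unnecessary. A minimal sketch, reusing the connection, username, password, and dn_recs names from the question (the user DN shown is a placeholder):

import ldap

l = ldap.open("192.168.1.1")
l.simple_bind_s(username, password)

# transitive (nested) group membership via LDAP_MATCHING_RULE_IN_CHAIN
user_dn = "CN=UserName,CN=Users,DC=YOURDOMAIN,DC=NET"   # placeholder DN
f_filterStr = "(member:1.2.840.113556.1.4.1941:=%s)" % user_dn

# every entry returned is a group the user is a direct or nested member of
results = l.search_s(dn_recs, ldap.SCOPE_SUBTREE, f_filterStr, ['name'])
for dn, attrs in results:
    if dn:  # skip referral entries, which come back with dn == None
        print(attrs.get('name'))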