Need help to read a file in python - python-2.7

I am trying to read a file and file contains various kinds of data.
The example of the file type is given below.
[CIRCUIT1]
CIRCUITNAME=CIRCUIT1
00.12 12/20 2.3 23.6
00.12 12/20 2.3 23.6
00.42 12/20 2.2 23.3
00.42 12/20 2.2 23.3
[CIRCUIT2]
CIRCUITNAME=CIRCUIT2
00.12 12/20 2.2 26.7
00.12 12/20 2.2 26.7
00.42 12/20 2.2 26.5
00.42 12/20 2.2 26.5
00.42 12/20 2.2 26.5
[AMBIENT]
00.42 12/20 8.6
01.42 12/20 8.6
02.42 12/20 8.6
03.42 12/20 8.7
04.42 12/20 8.8
05.42 12/20 8.6
06.42 12/20 8.7
Now, I have defined a function which only returns the 3rd and 4th column of circuit1.
but date and time formats should be returned and will be defined later. But I'm getting index out of range error.
def load_ci(filepath):
fileObj=open(filepath, 'r')
time_1=[],time_2=[],t=0,ti=0,loadCurrent_1=[],surfaceTemp_1=[],loadCurrent_2=[],surfaceTemp_2=[],ambient=[]
read=0
for line in fileObj:
if not line.strip():
continue
if read==1:
if '[AMBIENT]' in line:
read=3
continue
elif 'CIRCUITNAME=CIRCUIT2' in line: read=2
else:
if line!='\n' and '[CIRCUIT2]' not in line:
point=line.split(' ')
t=(float(point[0]))
ti=int(t)*3600+(t-int(t))*60*100
time_1.append(ti)
loadCurrent_1.append(float(point[2]))
surfaceTemp_1.append(float(point[3]))
if read==2:
if '[AMBIENT]' in line:
read=3
continue
elif 'CIRCUITNAME=CIRCUIT2' in line: read=2
else:
if line!='\n' and '[CIRCUIT2]' not in line:
point=line.split(' ')
t=(float(point[0]))
ti=int(t)*3600+(t-int(t))*60*100
time_2.append(ti)
loadCurrent_2.append(float(point[2]))
surfaceTemp_2.append(float(point[3]))
if read==3:
if line!='\n':
point=line.split(' ')
ambient.append(float(point[2]))
if 'CIRCUITNAME=CIRCUIT1' in line: read=1
return np.array(loadCurrent_1),np.array(surfaceTemp_1),np.array(loadCurrent_2),np.array(surfaceTemp_2),np.array(ambient),np.array(time_1),np.array(time_2)

After you detect a line containing [AMBIENT], you need to advance to the next line, while changing your read state to 3. Add a continue statement after read = 3 at two points in your code where you check for [AMBIENT]
Additionally, change your code checking for [CIRCUIT2] from
if line != '\n' and line != '[CIRCUIT2]':
to
if line != '\n' and '[CIRCUIT2]' not in line:
If you want to disregard empty lines, you can add a check at the beginning of your loop like:
if not line.strip():
continue
I've reworked your code in the question to break out parsing circuit from ambient data, to simplify state management. I pass around the file object, utilizing its iteration state to keep track of where we are in the file at any given point. Sections of the file start with '[...]' and end with a blank line, so I can take advantage of that. I group all of the circuit data into a dictionary for convenience, but this could be rolled into a full fledged class as well if you wanted to.
import numpy as np
def parseCircuit(it, header):
loadCurrent, surfaceTemp, time = [], [], []
for line in it:
line = line.strip()
if not line:
break
elif line.startswith('CIRCUITNAME='):
name = line[12:]
else:
point=line.split(' ')
h, m = map(int, point[0].split('.'))
time.append(h * 3600 + m * 60)
loadCurrent.append(float(point[2]))
surfaceTemp.append(float(point[3]))
return {'name': name,
'surfaceTemp': np.array(surfaceTemp),
'loadCurrent': np.array(loadCurrent),
'time': np.array(time)}
def parseAmbient(it, header):
ambient = []
for line in it:
line = line.strip()
if not line:
break
point=line.split(' ')
ambient.append(float(point[2]))
return np.array(ambient)
def load_ci(filepath):
fileObj=open(filepath, 'r')
circuits = {}
ambient = None
for line in fileObj:
line = line.strip() # remove \n from end of line
if not line: # skip empty lines
continue
if line.startswith('[CIRCUIT'):
circuit = parseCircuit(fileObj, line)
circuits[circuit['name']] = circuit
elif line.startswith('[AMBIENT'):
ambient = parseAmbient(fileObj, line)
return circuits, ambient
print load_ci('test.ci')
outputs
({'CIRCUIT2': {'loadCurrent': array([ 2.2, 2.2, 2.2, 2.2, 2.2]), 'surfaceTemp': array([ 26.7, 26.7, 26.5, 26.5, 26.5]), 'name': 'CIRCUIT2', 'time': array([ 720, 720, 2520, 2520, 2520])}, 'CIRCUIT1': {'loadCurrent': array([ 2.3, 2.3, 2.2, 2.2]), 'surfaceTemp': array([ 23.6, 23.6, 23.3, 23.3]), 'name': 'CIRCUIT1', 'time': array([ 720, 720, 2520, 2520])}}, array([ 8.6, 8.6, 8.6, 8.7, 8.8, 8.6, 8.7]))

Related

Error with FuzzyWuzzy: StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)

I cannot get the following function to run:
match, match_score = process.extractOne(score, pct_dict.keys())
I get a whitespace error I cannot seem to resolve. Any idea what is causing this?
What it should do: If the score is 15 it should return 0.026
Error:
Error: output = self.func(*resolved_args, **resolved_kwargs) wnas1
| File "/code/cleveland/templatetags/percentiles_ratings.py", line
32, in get_percentile_standard wnas1 | match, match_score =
process.extractOne(score, pct_dict.keys()) wnas1 | File
"/usr/local/lib/python3.7/site-packages/fuzzywuzzy/process.py", line
220, in extractOne wnas1 | return max(best_list, key=lambda
i: i[1]) wnas1 | File
"/usr/local/lib/python3.7/site-packages/fuzzywuzzy/process.py", line
78, in extractWithoutOrder wnas1 | processed_query =
processor(query) wnas1 | File
"/usr/local/lib/python3.7/site-packages/fuzzywuzzy/utils.py", line 95,
in full_process wnas1 | string_out =
StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
wnas1 | File
"/usr/local/lib/python3.7/site-packages/fuzzywuzzy/string_processing.py",
line 26, in replace_non_letters_non_numbers_with_whitespace wnas1
| return cls.regex.sub(" ", a_string)
Code:
from __future__ import unicode_literals
from django import template
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
register = template.Library()
#register.simple_tag
def get_perc(score):
MATCH_THRESHOLD = 80
pct_dict = {14: 0.016, 14.7: 0.021, 15.3: 0.026, 16: 0.034, 16.7: 0.04, 17.3: 0.05, 18: 0.07, 18.7: 0.09,
19.3: 0.11, 20: 0.13, 20.7: 0.17, 21.3: 0.21, 22: 0.26, 22.7: 0.31, 23.3: 0.38, 24: 0.47}
if not score:
return '--'
elif score < 26.7:
return '<1'
match, match_score = process.extractOne(score, pct_dict.keys())
if match_score >= MATCH_THRESHOLD:
return pct_dict[match]
else:
return '--'
As per fuzzywuzzy documentation, you need to compare between two strings. Meaning you need to convert you values in string to compare them. Then you need to do it like this:
match, match_score = process.extractOne(str(score), pct_dict.keys())
I would not recommend this approach because that will not be accurate.
>>> x = ['1','2','3']
>>> y='2'
>>> process.extractOne(y,x)
('2', 100)
>>> y='2.2'
>>> process.extractOne(y,x)
('2', 90)
>>> y = '2.9'
>>> process.extractOne(y,x)
('2', 90)
Here in last 2 entries, you will see score 90 for both 2.2 and 2.9, where 2.9 is much closer to 3.
As you have numbers and I would recommend you to do simply compare them like this:
value = min(pct_dict, key=lambda x:abs(x - score))
# then some logics to see if value is close to score or put some static threshold value like `abs(value-score) < .3`
There are few SO answers which might help you regarding this.
Thanks, that worked. Thanks for the clarification regarding string values. This allowed me to set str(score) to convert the score value to a string.
Here is the functioning code:
#register.simple_tag
def get_perc(score):
MATCH_THRESHOLD = 80
pct_dict = {'14': '0.016', '14.7': '0.021', '15.3': '0.026', '16': '0.034', '16.7': '0.04', '17.3': '0.05', '18': '0.07', '18.7': '0.09', '19.3': '0.11', '20': '0.13', '20.7': '0.17', '21.3': '0.21', '22': '0.26', '22.7': '0.31', '23.3': '0.38', '24': '0.47'}
if not score:
return '--'
elif score < 24:
return '<1'
match, match_score = process.extractOne(str(score), pct_dict.keys())
if match_score >= MATCH_THRESHOLD:
return pct_dict[match]
else:
return '--'

Force ylim range in subgraph

When plotting a serie of subgraphs with matplotlib, I can't set the ylim range properly.
Here's part of the code:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
(...) # loading npy data
titles = ["basestr1", "basestr2", "basestr3", "basestr4", "basestr5"]
labels = ["baselab1", "baselab2", "baselab3", "baselab4", "baselab5"]
linew = 2.24
ms = 10
mw = 2
fc = (1,1,1)
bc = (1,1,1)
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=[(1,0.4,0.4), (0.1,0.6,0.1), (0.04,0.2,0.04)])
mpl.rcParams.update({'font.size': 12})
fig2 = plt.subplots(2, 2, figsize=(12,9), facecolor=fc)
plt.rc('font', family='serif')
ax0 = plt.subplot(221)
ax1 = plt.subplot(222)
ax2 = plt.subplot(223)
ax3 = plt.subplot(224)
axl = [ax0, ax1, ax2, ax3]
em = []
fp = []
fn = []
gm = []
for c,element in enumerate(elements):
em.append([i[0] for i in element])
fp.append([i[1][1] if 1 in i[1] else 0 for i in element]) # red
fn.append([i[1][2] if 2 in i[1] else 0 for i in element]) # light green
gm.append([i[1][3] if 3 in i[1] else 0 for i in element]) # dark green
axl[c].semilogy(em[c], fp[c], "-x", lw=linew, markersize=ms, mew=mw) # red
axl[c].semilogy(em[c], fn[c], "-x", lw=linew, markersize=ms, mew=mw) # light green
axl[c].semilogy(em[c], gm[c], "-o", lw=linew, markersize=ms, mew=mw, mfc='None') # dark green
axl[c].set_ylim([-10, 200]) # <-- Here's the issue; it seems not to work properly.
axl[c].grid(True,which="both")
axl[c].set_title(titles[c])
axl[c].set_xlabel(labels[c])
axl[c].set_ylabel(r'Count')
plt.legend(['False', 'True', 'Others'], loc=3, bbox_to_anchor=(.62, 0.4), borderaxespad=0.)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
plt.savefig('/home/username/Desktop/figure.png',
facecolor=fig2.get_facecolor(),edgecolor='w',orientation='landscape',papertype=None,
format=None, transparent=False, bbox_inches=None, pad_inches=0.1,
frameon=None)
plt.show() # block=False
Where elements is a list containing 4 arrays.
Each of these array looks like:
elements[0]
Out[16]:
array([[1, {0.0: 1252, 1.0: 11, 2.0: 170, 3.0: 11}],
[2, {0.0: 1251, 1.0: 12, 2.0: 163, 3.0: 18}],
[3, {0.0: 1229, 1.0: 34, 2.0: 148, 3.0: 33}],
...,
[6, {0.0: 1164, 1.0: 99, 2.0: 125, 3.0: 56}],
[7, {0.0: 1111, 1.0: 152, 2.0: 105, 3.0: 76}],
[8, {0.0: 1056, 1.0: 207, 2.0: 81, 3.0: 100}]], dtype=object)
Where am I wrong?
I can set any values I want in axl[c].set_ylim([-10, 200]) it doesn't change anything on the output graph.
Update:
Ok, it seems not possible to set other value as 1 as starting y-axis value here.

Reading Data from CSV and fill Empty Values Python

I am reading in a CSV file with the general schema of
,abv,ibu,id,name,style,brewery_id,ounces
14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
I am running into problems where fields are not existing such as in object 0 where it is lacking an IBU. I would like to be able to insert a value such as 0.0 that would work as a float for values that require floats and an empty string for ones that require strings.
My code is along the lines of
import csv
import numpy as np
def dataset(path, filter_field, filter_value):
with open(path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
if filter_field:
for row in filter(lambda row: row[filter_field]==filter_value, reader):
yield row
def main(path):
data = [(row["ibu"], float(row["ibu"])) for row in dataset(path, "style", "American Pale Lager")]
As of right now my code would throw an error sine there are empty values in the "ibu" column for object 0.
How should one go about solving this problem?
You can do the following:
add a default dictionary input that you can use for missing values
and also to update upon certain conditions such as when ibu is empty
this is your implementation changed to correct for what you need. If I were you I would use pandas ...
import csv, copy
def dataset(path, filter_field, filter_value, default={'brewery_id':-1, 'style': 'unkown style', ' ': -1, 'name': 'unkown name', 'abi':0.0, 'id': -1, 'ounces':-1, 'ibu':0.0}):
with open(path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if row is None:
break
if row[filter_field].strip() != filter_value:
continue
default_row = copy.copy(default)
default_row.update(row)
# you might want to add conditions
if default_row["ibu"] == "":
default_row["ibu"] = default["ibu"]
yield default_row
data = [(row["ibu"], float(row["ibu"])) for row in dataset('test.csv', "style", "American Pale Lager")]
print data
>> [(0.0, 0.0)]
Why don't you use
import pandas as pd
df = pd.read_csv(data_file)
The following is the result:
In [13]: df
Out[13]:
Unnamed: 0 abv ibu id name style \
0 14 0.061 60.0 1979 Bitter Bitch American Pale Ale (APA)
1 0 0.050 NaN 1436 Pub Beer American Pale Lager
brewery_id ounces
0 177 12.0
1 408 12.0
Simulating your file with a text string:
In [48]: txt=b""" ,abv,ibu,id,name,style,brewery_id,ounces
...: 14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
...: 0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
...: """
I can load it with numpy genfromtxt.
In [49]: data=np.genfromtxt(txt.splitlines(),delimiter=',',dtype=None,skip_heade
...: r=1,filling_values=0)
In [50]: data
Out[50]:
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', '<f8'), ('f3', '<i4'), ('f4', 'S12'), ('f5', 'S23'), ('f6', '<i4'), ('f7', '<f8')])
In [51]:
I had to skip the header line because it is incomplete (a blank for the 1st field). The result is a structured array - a mix of ints, floats and strings (bytestrings in Py3).
After correcting the header line, and using names=True, I get
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('abv', '<f8'), ('ibu', '<f8'), ('id', '<i4'), ('name', 'S12'), ('style', 'S23'), ('brewery_id', '<i4'), ('ounces', '<f8')])
genfromtxt is the most powerful csv reader in numpy. See it's docs for more parameters. The pandas reader is faster and more flexible - but of course produces a data frame, not array.

Why I am getting the following AttributeError in Python?

I am using the sklearn's GradientBoostingRegression method. So after fitting it with 2000 estimators, I wanted to add more estimators to it. Since it is taking too long to rerun the entire fitting process, I used the set_params() method. Note that it is a multi-target problem, meaning, I have 3 targets to fit. So I am using the following code to add more estimators.
'''parameters: models (list of length 3 in our case )
train_X, train_y [n_samples x 3], test
n_estimators : previous + 500 (default) [additional estimators]
warm_start : True (default)
'''
def addMoreEstimators(train_X, train_y, test, models, n_estimators = 500, warm_start=True):
params = {'n_estimators':n_estimators, 'warm_start':warm_start}
gbm_pred= pd.DataFrame()
for (i,stars),clf in zip(enumerate(['*','**','***']), models):
clf.set_params(**params)
%time clf.fit(train_X.todense(),train_y[stars])
%time gbm_pred[stars] = clf.predict(test.todense())
gbm_pred = gbm_pred.as_matrix()
gbm_dict ={'model': gbm, 'prediction': gbm_pred}
return gbm_dict
Note: the models parameter is a list of 3 fitted models for the 3 targets.
When I ran it for the first time using 2500 (originally I had 2000 estimators), it ran fine and gave me an output.
When, I am running the same function using 3000 estimators, I am getting an AttributeError (see the traceback of the error below). Here the models contained the 3 fitted models. Below is the traceback of the error: (it's kinda long)
AttributeError Traceback (most recent call last)
<ipython-input-104-9418ada3b36f> in <module>()
7 test = val_X_tfidf[:,shortened_col_index],
8 models = models,
----> 9 n_estimators = 3000)
10
11 reduced_features_gbm_pred_3000_2_lr_1_msp_2 = reduced_features_gbm_model_3000_2_lr_1_msp_2['prediction']
<ipython-input-103-e15a4fb70b50> in addMoreEstimators(train_X, train_y, test, models, n_estimators, warm_start)
15
16 clf.set_params(**params)
---> 17 get_ipython().magic(u'time clf.fit(train_X.todense(),train_y[stars])')
18 print 'starting prediction'
19
//anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2305 magic_name, _, magic_arg_s = arg_s.partition(' ')
2306 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2307 return self.run_line_magic(magic_name, magic_arg_s)
2308
2309 #-------------------------------------------------------------------------
//anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2226 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2227 with self.builtin_trap:
-> 2228 result = fn(*args,**kwargs)
2229 return result
2230
//anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
//anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
//anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
1160 if mode=='eval':
1161 st = clock2()
-> 1162 out = eval(code, glob, local_ns)
1163 end = clock2()
1164 else:
<timed eval> in <module>()
//anaconda/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in fit(self, X, y, sample_weight, monitor)
973 self.estimators_.shape[0]))
974 begin_at_stage = self.estimators_.shape[0]
--> 975 y_pred = self._decision_function(X)
976 self._resize_state()
977
//anaconda/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _decision_function(self, X)
1080 # not doing input validation.
1081 score = self._init_decision_function(X)
-> 1082 predict_stages(self.estimators_, X, self.learning_rate, score)
1083 return score
1084
sklearn/ensemble/_gradient_boosting.pyx in sklearn.ensemble._gradient_boosting.predict_stages (sklearn/ensemble/_gradient_boosting.c:2502)()
AttributeError: 'int' object has no attribute 'tree_'
Sorry for the long traceback, but I think it wouldn't be possible to provide me with meaningful feedback.
Again, why am I getting this feedback ?
Any help would be greatly appreciated.
Thanks
Edit
Below is the code that generates the models that was one of the inputs in the above function.
from sklearn import ensemble
def updated_runGBM(train_X, train_y, test,
n_estimators =100,
max_depth = 1,
min_samples_split=1,
learning_rate=0.01,
loss= 'ls',
warm_start=True):
'''train_X : n_samples x m_features
train_y : n_samples x k_targets (multiple targets allowed)
test : n_samples x m_features
warm_start : True (originally the default is False, but I want to add trees)
'''
params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split,
'learning_rate': learning_rate, 'loss': loss,'warm_start':warm_start}
gbm1 = ensemble.GradientBoostingRegressor(**params)
gbm2 = ensemble.GradientBoostingRegressor(**params)
gbm3 = ensemble.GradientBoostingRegressor(**params)
gbm = [gbm1,gbm2,gbm3]
gbm_pred= pd.DataFrame()
for (i,stars),clf in zip(enumerate(['*','**','***']), gbm):
%time clf.fit(train_X.todense(),train_y[stars])
%time gbm_pred[stars] = clf.predict(test.todense())
gbm_pred = gbm_pred.as_matrix()
gbm_pred = np.clip(gbm_pred,0,np.inf)
gbm_dict ={'model': gbm, 'prediction': gbm_pred}
return gbm_dict
NOTE In the code above, I have removed some of the print statements to reduce clutter.
These are the two functions I am using, nothing else (apart from the code to split up the data).

Opening a text file and then storing the contents into a nested dictionary in python 2.7

I'm fairly new to Python, and computing languages in general. I want to open a text file and then store its contents in a nested dictionary. Here's my code so far:
inputfile = open("Proj 4.txt", "r")
for line in inputfile:
line = line.strip()
print line
NGC = {}
inputfile.close()
I know I need to use the add operation for dictionaries I'm just unsure how to proceed. Here's a copy of the text file:
NGC0224
Name: Andromeda Galaxy
Messier: M31
Distance: 2900
Magnitude: 3.4
NGC6853
Name: Dumbbell Nebula
Messier: M27
Distance: 1.25
Magnitude: 7.4
NGC4826
Name: Black Eye Galaxy
Messier: M64
Distance: 19000
Magnitude: 8.5
NGC4254
Name: Coma Pinwheel Galaxy
Messier: M99
Distance: 60000
Brightness: 9.9 mag
NGC5457
Name: Pinwheel Galaxy
Messier: M101
Distance: 27000
Magnitude: 7.9
NGC4594
Name: Sombrero Galaxy
Messier: M104
Distance: 50000
with open(infilepath) as infile:
answer = {}
name = None
for line in infile:
line = line.strip()
if line.startswith("NGC"):
name = line
answer[name] = {}
else:
var, val = line.split(':', 1)
answer[name][var.strip()] = val.strip()
Output with your text file:
>>> with open(infilepath) as infile:
... answer = {}
... name = None
... for line in infile:
... line = line.strip()
... if line.startswith("NGC"):
... name = line
... answer[name] = {}
... else:
... var, val = line.split(':', 1)
... answer[name][var.strip()] = val.strip()
...
>>> answer
{'NGC6853': {'Messier': 'M27', 'Magnitude': '7.4', 'Distance': '1.25', 'Name': 'Dumbbell Nebula'}, 'NGC4254': {'Brightness': '9.9 mag', 'Messier': 'M99', 'Distance': '60000', 'Name': 'Coma Pinwheel Galaxy'}, 'NGC4594': {'Messier': 'M104', 'Distance': '50000', 'Name': 'Sombrero Galaxy'}, 'NGC0224': {'Messier': 'M31', 'Magnitude': '3.4', 'Distance': '2900', 'Name': 'Andromeda Galaxy'}, 'NGC4826': {'Messier': 'M64', 'Magnitude': '8.5', 'Distance': '19000', 'Name': 'Black Eye Galaxy'}, 'NGC5457': {'Messier': 'M101', 'Magnitude': '7.9', 'Distance': '27000', 'Name': 'Pinwheel Galaxy'}}
You have to define better how you want this data mapped to a dictionary. I you can change the file format, it would be nice to reformat it as a standard INI file. You could read it with the ConfigParser module.
But if you really want to go this way. Here is a quick and dirty solution:
d = {}
k = ''
for line in open('Proj 4.txt'):
if ':' in line:
key, value = line.split(':', 1)
d[k][key] = value.strip()
else:
k = line.strip()
d[k] = {}
The dict d has the parsed file.