I'm not able to run a Python pipeline through Airflow's BeamRunPythonPipelineOperator. Below is my complete code:
DAG FILE
import os
from datetime import datetime, timedelta
from airflow.utils.dates import days_ago
from airflow import DAG
from airflow.providers.google.cloud.operators.dataflow import DataflowConfiguration
from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
from airflow.providers.google.cloud.operators.dataflow import DataflowTemplatedJobStartOperator
default_args = {
"owner": "<...>",
"start_date": days_ago(1),
'dataflow_default_options': {
"project": "<...>",
}
}
dag = DAG(
dag_id="word_count",
default_args=default_args,
    schedule_interval="@once"
)
start_python_pipeline_dataflow_runner = BeamRunPythonPipelineOperator(
task_id="start_python_pipeline_dataflow_runner",
runner="DataflowRunner",
py_file="gs://<...>/word_count.py",
pipeline_options={
'input':"gs://<...>/kinglear.txt",
'output':"gs://<...>/output.txt",
'temp_location':"gs://<...>/temp/",
'staging_location':"gs://<...>/temp/",
},
py_options=[],
py_requirements=['apache-beam[gcp]==2.26.0'],
py_interpreter='python3',
py_system_site_packages=False,
dataflow_config=DataflowConfiguration(
job_name='{{task.task_id}}', project_id="<...>", location="us-central1"
),
dag=dag,
)
Python File (word_count.py)
"""A word-counting workflow."""
# pytype: skip-file
import argparse
import logging
import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
"""Parse each line of input text into words."""
def process(self, element):
"""Returns an iterator over the words of this element.
The element is a line of text. If the line is blank, note that, too.
Args:
element: the element being processed
Returns:
The processed element.
"""
return re.findall(r'[\w\']+', element, re.UNICODE)
def run(argv=None, save_main_session=True):
"""Main entry point; defines and runs the wordcount pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
default='gs://<...>/kinglear.txt',
help='Input file to process.')
parser.add_argument(
'--output',
dest='output',
default='gs://<...>/output.txt',
help='Output file to write results to.')
argv = [
'--project=<...>',
'--region=us-central1',
'--runner=DataflowRunner',
'--staging_location=gs://<...>/temp/',
'--temp_location=gs://<...>/temp/',
'--template_location=gs://<...>/templates/word_count_template'
]
known_args, pipeline_args = parser.parse_known_args(argv)
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
# The pipeline will be run on exiting the with block.
with beam.Pipeline(argv=argv,options=pipeline_options) as p:
# Read the text file[pattern] into a PCollection.
lines = p | 'Read' >> ReadFromText(known_args.input)
counts = (
lines
| 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
| 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
| 'GroupAndSum' >> beam.CombinePerKey(sum))
# Format the counts into a PCollection of strings.
def format_result(word, count):
return '%s: %d' % (word, count)
output = counts | 'Format' >> beam.MapTuple(format_result)
# Write the output using a "Write" transform that has side effects.
# pylint: disable=expression-not-assigned
output | 'Write' >> WriteToText(known_args.output)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
A screenshot of the Cloud Composer environment was attached here (not reproduced).
I am not able to see the Dataflow job in the console, and there is no word-count output in the bucket. Could anyone suggest the right approach or any other fixes for this?
Your DAG is fine; the problem is in the Beam Python file. The Dataflow arguments are being passed through a hand-built argv list, and the job is never submitted because that argv is handed directly to beam.Pipeline. The better approach is to extend pipeline_args and build the PipelineOptions from it.
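The core of the change, excerpted from the full file below: parse the arguments, extend pipeline_args with the Dataflow options, build PipelineOptions from it, and let the with block submit the job (no argv is passed to beam.Pipeline):
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_args.extend([
    '--runner=DataflowRunner',
    '--project=<project-name>',
    '--region=<region>',
    '--staging_location=gs://<bucket>/',
    '--temp_location=gs://<bucket>/temp',
    '--job_name=your-wordcount-job',
])
pipeline_options = PipelineOptions(pipeline_args)

with beam.Pipeline(options=pipeline_options) as p:
    ...  # transforms as before; the pipeline runs on exiting the with block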
Following is the fixed code:
word_count.py :
"""A word-counting workflow."""
# pytype: skip-file
import argparse
import logging
import re
import os
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
"""Parse each line of input text into words."""
def process(self, element):
"""Returns an iterator over the words of this element.
The element is a line of text. If the line is blank, note that, too.
Args:
element: the element being processed
Returns:
The processed element.
"""
return re.findall(r'[\w\']+', element, re.UNICODE)
def run(argv=None, save_main_session=True):
"""Main entry point; defines and runs the wordcount pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
default='gs://dataflow-samples/shakespeare/kinglear.txt',
help='Input file to process.')
parser.add_argument(
'--output',
dest='output',
default='gs://<bucket>/newoutput',
help='Output file to write results to.')
#argv = [
# '--project=<...>',
# '--region=us-central1',
# '--runner=DataflowRunner',
# '--staging_location=gs://<...>/temp/',
# '--temp_location=gs://<...>/temp/',
# '--template_location=gs://<...>/templates/word_count_template'
# ]
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_args.extend([
'--runner=DataflowRunner',
'--project=<project-name>',
'--region=<region>',
'--staging_location=gs://<bucket>/',
'--temp_location=gs://<bucket>/temp',
'--job_name=your-wordcount-job',
])
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
# The pipeline will be run on exiting the with block.
with beam.Pipeline(options=pipeline_options) as p:
lines = p | 'Read' >> ReadFromText(known_args.input)
counts = (
lines
| 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
| 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
| 'GroupAndSum' >> beam.CombinePerKey(sum))
# Format the counts into a PCollection of strings.
def format_result(word, count):
return '%s: %d' % (word, count)
output = counts | 'Format' >> beam.MapTuple(format_result)
# Write the output using a "Write" transform that has side effects.
# pylint: disable=expression-not-assigned
output | 'Write' >> WriteToText(known_args.output)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
Related
This is our Python code for streaming a large file from Google Cloud Storage to Cloud Run. The large CSV file is split into chunks, and each chunk is downloaded into in_memory_file. As soon as the first chunk arrives, it should start streaming immediately, with the remaining chunks following in the response, but we are not able to stream those chunks out of in_memory_file.
import os
import signal
import sys
import json
import pandas as pd
import structlog
import time
import threading
import io
import csv
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
from types import FrameType
from io import BytesIO, StringIO
from google.cloud import storage
from google.oauth2 import service_account
from flask import Flask, Response, request
app = Flask(__name__)
# SigTerm Log
def getJSONLogger() -> structlog._config.BoundLoggerLazyProxy:
structlog.configure(
        processors=[
structlog.stdlib.add_log_level,
structlog.stdlib.PositionalArgumentsFormatter(),
structlog.processors.TimeStamper("iso"),
structlog.processors.JSONRenderer(),
],
wrapper_class=structlog.stdlib.BoundLogger,
)
return structlog.get_logger()
logger = getJSONLogger()
# SigTerm Handler
def shutdown_handler(signal: int, frame: FrameType) -> None:
logger.info("Signal received, safely shutting down.")
print("Exiting process.", flush=True)
sys.exit(0)
signal.signal(signal.SIGTERM, shutdown_handler)
# Split files to chunks
def split_byte_size(size: int, uri: str, bucket: str, key: str) -> list:
byte_list = []
chunks = 50
start = 0
for i in range(size, size * chunks + 1, size):
stop = i // chunks
byte_list.append({"uri": uri, "start": start, "end": stop, "bucket": bucket, "key": key})
start = stop + 1
return byte_list
#Cloud Storage connection
project = 'XYZ'
service_account_credentials_path = 'key.json'
credentials = service_account.Credentials.from_service_account_file(service_account_credentials_path)
storage_client = storage.Client(project=project, credentials=credentials)
# Download objects as chunks
def downloader(input: dict) -> object:
bucket_object = storage_client.get_bucket(bucket_or_name=input["bucket"])
blob = bucket_object.blob(input["key"])
in_memory_file = io.BytesIO()
blob.download_to_file(in_memory_file, start=input['start'], end=input['end'])
#print("Chunk " + str(input['start']) + " to " + str(input['end']) + "completed")
return in_memory_file
@app.route("/chunk_data")
def chunk_data():
bucket_name = 'cloudrundemofile'
source_blob_name = 'demofile.csv'
bucket_object = storage_client.get_bucket(bucket_name)
blob = bucket_object.get_blob(source_blob_name)
split_bytes = split_byte_size(blob.size, project, bucket_name, source_blob_name)
print(split_bytes)
#Async Thread
with ThreadPoolExecutor(max_workers=5) as ex:
results = ex.map(downloader, split_bytes)
resp = Response(results, 206, mimetype='text/csv')
#resp.headers.add('Content-Range', 'bytes {0}-{1}/{2}'.format(start, start + length - 1, file_size))
return resp
#return "Success"
if __name__ == "__main__":
signal.signal(signal.SIGINT, shutdown_handler)
app.run(host="0.0.0.0", port=8080)
else:
    signal.signal(signal.SIGTERM, shutdown_handler)
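One way to stream the chunks as they are downloaded (a hedged sketch, not part of the original post; it reuses the downloader, split_byte_size, storage_client, project, and app objects defined above and replaces the chunk_data view) is to hand flask.Response a generator that yields each chunk's bytes. ThreadPoolExecutor.map returns results in submission order, so the byte ranges stay in sequence:
from concurrent.futures import ThreadPoolExecutor
from flask import Response

def stream_chunks(split_bytes):
    # Download the byte ranges in parallel, but yield them in order so the
    # response body starts as soon as the first chunk has been fetched.
    with ThreadPoolExecutor(max_workers=5) as ex:
        for in_memory_file in ex.map(downloader, split_bytes):
            in_memory_file.seek(0)       # rewind the BytesIO before reading
            yield in_memory_file.read()  # emit the raw bytes of this chunk

@app.route("/chunk_data")
def chunk_data():
    bucket_name = 'cloudrundemofile'
    source_blob_name = 'demofile.csv'
    blob = storage_client.get_bucket(bucket_name).get_blob(source_blob_name)
    split_bytes = split_byte_size(blob.size, project, bucket_name, source_blob_name)
    # Passing a generator keeps the connection open and streams chunk by chunk.
    return Response(stream_chunks(split_bytes), 200, mimetype='text/csv')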
Hello all, I am working with Airflow. Here is the scenario I am trying to resolve:
I want to create DAGs dynamically after a function runs.
try:
import os
import sys
from datetime import timedelta,datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.email_operator import EmailOperator
from airflow.utils.trigger_rule import TriggerRule
from airflow.utils.task_group import TaskGroup
import pandas as pd
print("All Dag modules are ok ......")
except Exception as e:
print("Error {} ".format(e))
# ===============================================
default_args = {
"owner": "airflow",
"start_date": datetime(2021, 1, 1),
"retries": 1,
"retry_delay": timedelta(minutes=1),
    'email': ['shahsoumil519@gmail.com'],
'email_on_failure': True,
'email_on_retry': False,
}
dag = DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False)
# ================================================
class XcomHelper(object):
def __init__(self, **context):
self.context = context
def get(self, key=None):
""" Get the Value from XCOM"""
try:
return self.context.get("ti").xcom_pull(key=key)
except Exception as e: return "Error"
def push(self, key=None, value=None):
"""Push the value on session """
try:
self.context['ti'].xcom_push(key=key, value=value)
return True
except Exception as e: return False
def create_dag(dag_id,schedule,dag_number,default_args):
def hello_world_py():
print('Hello World')
dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
with dag:
t1 = PythonOperator(task_id=dag_id,python_callable=hello_world_py)
return dag
def simple_task(**context):
DATA = ["soumil", "Shah"]
for n in range(1, len(DATA)):
try:
dag_id = 'hello_world_{}'.format(str(n))
print("DAG ID : {} ".format(dag_id))
default_args = {'owner': 'airflow','start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
dag_number = n
globals()[dag_id] = create_dag(dag_id,schedule, dag_number,default_args)
except Exception as e:
print("Error : {} ".format(e))
with DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False) as dag:
simple_task = PythonOperator(task_id="simple_task",
python_callable=simple_task,
provide_context=True)
simple_task
I want to create these DAGs based on the length of the DATA variable; that data comes from a database.
I tried looking into:
https://www.astronomer.io/guides/dynamically-generating-dags
Can an Airflow task dynamically generate a DAG at runtime?
https://medium.com/@flavio.mtps/making-use-of-python-globals-to-dynamically-create-airflow-dags-124e556b704e
Any help would be great.
Revised Code :
try:
import os
import sys
from datetime import timedelta, datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
# from airflow.operators.email_operator import EmailOperator
# from airflow.utils.trigger_rule import TriggerRule
# from airflow.utils.task_group import TaskGroup
# import pandas as pd
print("All Dag modules are ok ......")
except Exception as e:
print("Error {} ".format(e))
def create_dag(dag_id, schedule, dag_number, default_args):
def hello_world_py():
print('Hello World')
dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
with dag:
t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
return dag
def simple_task():
DATA = ["soumil", "Shah", "Shah2"]
for n in range(0, len(DATA)):
try:
dag_id = 'hello_world_{}'.format(str(n))
print("DAG ID : {} ".format(dag_id))
default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
dag_number = n
globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
except Exception as e:
print("Error : {} ".format(e))
def trigger_function():
print("HEREE")
simple_task()
with DAG(dag_id="project", schedule_interval="@once", default_args={'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}, catchup=False) as dag:
trigger_function = PythonOperator(task_id="trigger_function",python_callable=trigger_function,provide_context=True,)
trigger_function
I removed a few lines from your code to keep the answer to the point. The code below will generate DAGs named hello_world_0, hello_world_1, ... based on the contents of DATA.
EDIT: I used Airflow v1.10.x, but the code should also work on v2.x.
Suggestions:
Make the task names different from the DAG names (a small sketch of this appears after the code below).
The dag_number variable is currently unused and can be removed.
The trimmed code is below; in the UI the generated DAGs will show up as hello_world_0, hello_world_1, and hello_world_2 (screenshot omitted).
try:
import os
import sys
from datetime import timedelta, datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
# from airflow.operators.email_operator import EmailOperator
# from airflow.utils.trigger_rule import TriggerRule
# from airflow.utils.task_group import TaskGroup
# import pandas as pd
print("All Dag modules are ok ......")
except Exception as e:
print("Error {} ".format(e))
def create_dag(dag_id, schedule, dag_number, default_args):
def hello_world_py():
print('Hello World')
dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
with dag:
t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
return dag
def simple_task():
DATA = ["soumil", "Shah", "Shah2"]
for n in range(0, len(DATA)):
try:
dag_id = 'hello_world_{}'.format(str(n))
print("DAG ID : {} ".format(dag_id))
default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
dag_number = n
globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
except Exception as e:
print("Error : {} ".format(e))
simple_task()
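A small sketch of the first suggestion (the "_task" suffix is only a hypothetical naming choice): give each task an id that differs from its DAG id, so tasks and DAGs are easier to tell apart in the UI.
def create_dag(dag_id, schedule, dag_number, default_args):
    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        # e.g. task "hello_world_0_task" inside DAG "hello_world_0"
        PythonOperator(task_id='{}_task'.format(dag_id), python_callable=hello_world_py)
    return dag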
I am trying to insert data into BigQuery from a file using a Dataflow pipeline. The code below gives me an access denied error.
I have also set the application credentials via an environment variable, but I started getting an error that says:
The Application Default Credentials are not available.
I need help resolving this. Thanks in advance. Please find the code below:
from __future__ import absolute_import
import argparse
import logging
import re
import apache_beam as beam
import os
from apache_beam.options.pipeline_options import PipelineOptions
class DataIngestion:
def parse_method(self, string_input):
values = re.split(",",
re.sub('\r\n', '', re.sub(u'"', '', string_input)))
row = dict(
zip(('state', 'gender', 'year', 'name', 'number', 'created_date'),
values))
return row
def run(argv=None):
"""The main function which creates the pipeline and runs it."""
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "LocalPath\FileName.json"
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
required=False,
help='Input file to read. This can be a local file or '
'a file in a Google Storage Bucket.',
default='gs://python-dataflow-example/data_files/head_usa_names.csv')
parser.add_argument('--output',
dest='output',
required=False,
help='Output BQ table to write results to.',
default='lake.usa_names')
# Parse arguments from the command line.
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_args.extend([
'--runner=DirectRunner',
'--project=projectID',
'--staging_location=staging_location',
'--temp_location=temp_location',
])
data_ingestion = DataIngestion()
p = beam.Pipeline(options=PipelineOptions(pipeline_args))
(p
| 'Read from a File' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1)
| 'String To BigQuery Row' >>
beam.Map(lambda s: data_ingestion.parse_method(s))
| 'Write to BigQuery' >> beam.io.Write(
beam.io.BigQuerySink(
known_args.output,
schema='state:STRING,gender:STRING,year:STRING,name:STRING,'
'number:STRING,created_date:STRING',
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
p.run().wait_until_finish()
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
You can remove the code that sets the credentials inside your Beam job and instead run the command below before launching the job:
gcloud auth application-default login
This sets up Application Default Credentials on your machine, which your job will pick up automatically.
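If you would rather keep pointing the job at a service-account key file from Python, a minimal sketch (the path is a placeholder, not from the original post) is to export the variable before any pipeline or client object is created, for example at the very top of the script:
import os

# Hypothetical placeholder path -- replace with the real, readable location of
# your service-account JSON key. Set it before the pipeline is constructed so
# google-auth can find it when the BigQuery sink authenticates.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account-key.json"
If the variable is not visible to the process that actually runs the pipeline, google-auth falls back to Application Default Credentials and fails with the message you are seeing.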
I'm trying to write an AWS Lambda service using Python 2.7 that will generate an in-memory CSV file and email it as an attachment. I feel like I'm close with this script based on what I've learned, but I'm not quite there.
# Import smtplib for the actual sending function
import smtplib
import sys
import csv
import cStringIO
from os.path import basename
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
# Import the email modules we'll need
server = smtplib.SMTP('smtp.postmarkapp.com', 587)
server.starttls()
server.login('.....','.....')
list = []
row1 = ["One","Two","Three"]
list.append(row1)
msg = MIMEMultipart()
msg['To'] = "daniel@mydomain.com"
msg['From'] = "noreply@mydomain.com"
msg['Subject'] = "DG Test subject"
msg.attach(MIMEText("Test Message"))
csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, lineterminator='\n')
writer.writerow(["1","2","3"])
for row in list:
writer.writerow(row)
print(csv_buffer.getvalue())
msg.attach(csv_buffer)
try:
    response = server.sendmail(msg['From'], ["daniel@mydomain.com"], msg.as_string())
server.quit()
except AttributeError as error:
print(error)
else:
print(response)
This gives me the following error:
1,2,3
One,Two,Three
'cStringIO.StringO' object has no attribute 'get_content_maintype'
Basically, it comes down to not being sure how to use the csv_buffer object. I assume I just need to add that attribute to the object somehow, but I'm not quite sure how. If I try to add any additional arguments to the .attach() line, it complains that I have too many arguments.
Thanks!
I figured it out, thanks to stitching together a few SO posts.
import cStringIO
import csv
csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, delimiter=',', quoting=csv.QUOTE_ALL)
writer.writerow(["1","2","3"])
for row in list:
writer.writerow(row)
print(csv_buffer.getvalue())
# new lines: wrap the CSV text in a MIMEText part so it carries the MIME
# headers (get_content_maintype, etc.) that msg.attach() expects
csv_file = MIMEText(csv_buffer.getvalue())
csv_file.add_header('Content-Disposition', 'attachment', filename="csv_file.csv")
msg.attach(csv_file)
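For reference, an alternative sketch using MIMEApplication (already imported at the top of the question) attaches the same CSV text as a generic application part; either way the attached object carries the MIME headers that msg.attach() expects:
from email.mime.application import MIMEApplication

# Build the attachment from the buffer contents and mark it as a downloadable file.
part = MIMEApplication(csv_buffer.getvalue(), Name="csv_file.csv")
part.add_header('Content-Disposition', 'attachment', filename="csv_file.csv")
msg.attach(part)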
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
from deepwalk import graph
from deepwalk import walks as serialized_walks
from walks import WalksCorpus
from gensim.models import Word2Vec
from deepwalk.skipgram import Skipgram
from six import text_type as unicode
from six import iteritems
from six.moves import range
import psutil
from multiprocessing import cpu_count
p = psutil.Process(os.getpid())
try:
p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
try:
p.cpu_affinity(list(range(cpu_count())))
except AttributeError:
pass
logger = logging.getLogger(__name__)
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
def debug(type_, value, tb):
if hasattr(sys, 'ps1') or not sys.stderr.isatty():
sys.__excepthook__(type_, value, tb)
else:
import traceback
import pdb
traceback.print_exception(type_, value, tb)
print(u"\n")
pdb.pm()
def process(args):
if args.format == "adjlist":
G = graph.load_adjacencylist(args.input, undirected=args.undirected)
elif args.format == "edgelist":
G = graph.load_edgelist(args.input, undirected=args.undirected)
elif args.format == "mat":
G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
else:
raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
print("Number of nodes: {}".format(len(G.nodes())))
num_walks = len(G.nodes()) * args.number_walks
print("Number of walks: {}".format(num_walks))
data_size = num_walks * args.walk_length
print("Data size (walks*length): {}".format(data_size))
if data_size < args.max_memory_data_size:
print("Walking...")
walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
print("Training...")
model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
else:
print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
print("Walking...")
walks_filebase = args.output + ".walks"
walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
num_workers=args.workers)
print("Counting vertex frequency...")
if not args.vertex_freq_degree:
vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
else:
# use degree distribution for frequency in tree
vertex_counts = G.degree(nodes=G.iterkeys())
print("Training...")
walks_corpus = serialized_walks.WalksCorpus(walk_files)
model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
size=args.representation_size,
window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
model.wv.save_word2vec_format(args.output)
def main():
parser = ArgumentParser("deepwalk",
formatter_class=ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument("--debug", dest="debug", action='store_true', default=False,
help="drop a debugger if an exception is raised.")
parser.add_argument('--format', default='adjlist',
help='File format of input file')
parser.add_argument('--input', nargs='?', required=True,
help='Input graph file')
parser.add_argument("-l", "--log", dest="log", default="INFO",
help="log verbosity level")
parser.add_argument('--matfile-variable-name', default='network',
help='variable name of adjacency matrix inside a .mat file.')
parser.add_argument('--max-memory-data-size', default=1000000000, type=int,
help='Size to start dumping walks to disk, instead of keeping them in memory.')
parser.add_argument('--number-walks', default=10, type=int,
help='Number of random walks to start at each node')
parser.add_argument('--output', required=True,
help='Output representation file')
parser.add_argument('--representation-size', default=64, type=int,
help='Number of latent dimensions to learn for each node.')
parser.add_argument('--seed', default=0, type=int,
help='Seed for random walk generator.')
parser.add_argument('--undirected', default=True, type=bool,
help='Treat graph as undirected.')
parser.add_argument('--vertex-freq-degree', default=False, action='store_true',
help='Use vertex degree to estimate the frequency of nodes '
'in the random walks. This option is faster than '
'calculating the vocabulary.')
parser.add_argument('--walk-length', default=40, type=int,
help='Length of the random walk started at each node')
parser.add_argument('--window-size', default=5, type=int,
help='Window size of skipgram model.')
parser.add_argument('--workers', default=1, type=int,
help='Number of parallel processes.')
args = parser.parse_args()
numeric_level = getattr(logging, args.log.upper(), None)
logging.basicConfig(format=LOGFORMAT)
logger.setLevel(numeric_level)
if args.debug:
sys.excepthook = debug
process(args)
if __name__ == "__main__":
sys.exit(main())
Error:
Traceback (most recent call last):
  File "main.py", line 165, in <module>
    sys.exit(main())
  File "main.py", line 162, in main
    process(args)
  File "main.py", line 93, in process
    walks_corpus = serialized_walks.WalksCorpus(walk_files)
AttributeError: 'module' object has no attribute 'WalksCorpus'
Why do I get this error?
It looks as though you are importing WalksCorpus on its own from walks with from walks import WalksCorpus. Then, when you try to use WalksCorpus, you look it up inside serialized_walks, which I assume does not define WalksCorpus.
Try changing this line.
walks_corpus = serialized_walks.WalksCorpus(walk_files)
To:
walks_corpus = WalksCorpus(walk_files)
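If your installed deepwalk version does define WalksCorpus inside its walks module (an assumption about the package layout, not something the traceback confirms), an equivalent fix is to import it from the package and drop the bare from walks import WalksCorpus line:
# Assumes deepwalk/walks.py provides WalksCorpus in your installed version.
from deepwalk.walks import WalksCorpus

walks_corpus = WalksCorpus(walk_files)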