(Continued from a previous question)
I am trying to deploy a Google Dataflow job and run it as a cron job on Google App Engine, following the method described here.
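Concretely, the App Engine service submits the pipeline from a Flask handler hit by the cron schedule, roughly like this (a simplified sketch: the route, project and bucket names are placeholders, and run() is assumed to be the usual Beam-style entry point of the pipeline module introduced just below):

# Sketch of the App Engine handler that the cron job hits to submit the
# Dataflow job. Project, bucket and route names are placeholders.
from flask import Flask

from pipelines import script  # the pipeline module described below

app = Flask(__name__)


@app.route('/launch-pipeline')
def launch_pipeline():
    # script.run() is assumed to build and run the Beam pipeline with the
    # given options, following the usual run(argv) convention.
    script.run([
        '--runner=DataflowRunner',
        '--project=my-project',
        '--temp_location=gs://my-bucket/temp',
        '--staging_location=gs://my-bucket/staging',
    ])
    return 'Pipeline submitted', 200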
I have a Dataflow script (written in Python) at pipelines/script.py. Running this script locally (using the Apache Beam DirectRunner) or on Google Cloud (using the DataflowRunner) works properly. But when the job is deployed to run periodically on App Engine, it raises the following error when executed:
(4cb822d7f796239a): Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 582, in do_work
    work_executor.execute()
  File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py", line 166, in execute
    op.start()
  File "apache_beam/runners/worker/operations.py", line 294, in apache_beam.runners.worker.operations.DoOperation.start (apache_beam/runners/worker/operations.c:10607)
    def start(self):
  File "apache_beam/runners/worker/operations.py", line 295, in apache_beam.runners.worker.operations.DoOperation.start (apache_beam/runners/worker/operations.c:10501)
    with self.scoped_start_state:
  File "apache_beam/runners/worker/operations.py", line 300, in apache_beam.runners.worker.operations.DoOperation.start (apache_beam/runners/worker/operations.c:9702)
    pickler.loads(self.spec.serialized_fn))
  File "/usr/local/lib/python2.7/dist-packages/apache_beam/internal/pickler.py", line 225, in loads
    return dill.loads(s)
  File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 277, in loads
    return load(file)
  File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 266, in load
    obj = pik.load()
  File "/usr/lib/python2.7/pickle.py", line 858, in load
    dispatch[key](self)
  File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
    klass = self.find_class(module, name)
  File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 423, in find_class
    return StockUnpickler.find_class(self, module, name)
  File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
    __import__(module)
ImportError: No module named pipelines.spanner_backup
This is the stack trace visible when accessing the job directly in the Dataflow panel of the Google Cloud Console. However, if I click on "Stack Traces" to see the error stack trace in the "Stackdriver Error Reporting" panel, I see the following trace:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 738, in run
work, execution_context, env=self.environment)
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/workitem.py", line 130, in get_work_items
work_item_proto.sourceOperationTask.split)
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/workercustomsources.py", line 142, in __init__
source_spec[names.SERIALIZED_SOURCE_KEY]['value'])
File "/usr/local/lib/python2.7/dist-packages/apache_beam/internal/pickler.py", line 225, in loads
return dill.loads(s)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 277, in loads
return load(file)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 266, in load
obj = pik.load()
File "/usr/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self)
File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
klass = self.find_class(module, name)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 423, in find_class
return StockUnpickler.find_class(self, module, name)
File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
__import__(module)
ImportError: No module named spanner.client
This suggests some import error when sharing things between the workers? The Google Spanner client should be properly installed, though.
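For context, the Spanner client is used inside the pipeline roughly like this (a simplified sketch; the DoFn name is illustrative, not my actual code):

import apache_beam as beam
from google.cloud import spanner  # imported at module level in my script


class ReadFromSpanner(beam.DoFn):  # illustrative name
    def start_bundle(self):
        # The client is only instantiated once the bundle starts on a worker.
        self.client = spanner.Client()

    def process(self, element):
        # ... query Spanner through self.client and emit the results ...
        yield element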
I am using:
Flask==0.12.2
apache-beam[gcp]==2.1.1
gunicorn==19.7.1
gevent==1.2.1
google-cloud-dataflow==2.1.1
google-cloud-spanner==0.26
Am I missing something?
Edit: My setup.py is the following (as described here; the corresponding GitHub link with comments is here):
from distutils.command.build import build as _build
import subprocess
import setuptools


class build(_build):  # pylint: disable=invalid-name
    sub_commands = _build.sub_commands + [('CustomCommands', None)]


CUSTOM_COMMANDS = [
    ['echo', 'Custom command worked!']]


class CustomCommands(setuptools.Command):
    """A setuptools Command class able to run arbitrary commands."""

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def RunCustomCommand(self, command_list):
        print 'Running command: %s' % command_list
        p = subprocess.Popen(
            command_list,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        # Can use communicate(input='y\n'.encode()) if the command run requires
        # some confirmation.
        stdout_data, _ = p.communicate()
        print 'Command output: %s' % stdout_data
        if p.returncode != 0:
            raise RuntimeError(
                'Command %s failed: exit code: %s' % (command_list, p.returncode))

    def run(self):
        for command in CUSTOM_COMMANDS:
            self.RunCustomCommand(command)


REQUIRED_PACKAGES = ["Flask==0.12.2",
                     "apache-beam[gcp]==2.1.1",
                     "gunicorn==19.7.1",
                     "gevent==1.2.1",
                     "google-cloud-dataflow==2.1.1",
                     "google-cloud-spanner==0.26"
                     ]

setuptools.setup(
    name='dataflow_python_pipeline',
    version='1.0.0',
    description='DataFlow Python Pipeline',
    install_requires=REQUIRED_PACKAGES,
    packages=setuptools.find_packages(),
    cmdclass={
        'build': build,
        'CustomCommands': CustomCommands,
    }
)
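This setup.py is handed to the Dataflow runner through the setup_file pipeline option, roughly like this (again a simplified sketch; the project and bucket names are placeholders):

# Sketch: pointing the Dataflow runner at setup.py so that the pipelines
# package gets installed on the workers. Names below are placeholders.
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',
    '--temp_location=gs://my-bucket/temp',
    '--staging_location=gs://my-bucket/staging',
])
options.view_as(SetupOptions).setup_file = './setup.py'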