change to package
abalon is now a package
Tagar committed Nov 3, 2017
1 parent 90ca58a commit e82a845
Showing 4 changed files with 47 additions and 3 deletions.
3 changes: 3 additions & 0 deletions __init__.py
@@ -0,0 +1,3 @@

__all__ = ["pyspark"]

Empty file added abalon/__init__.py
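Taken together, the two __init__.py files above turn the code into an importable package; a minimal sketch of the resulting import (the alias is my own choice, to avoid shadowing the real pyspark library):

from abalon import pyspark as abalon_pyspark   # the module this commit moves into the package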
29 changes: 26 additions & 3 deletions pyspark.py → abalon/pyspark.py
@@ -1,10 +1,31 @@


###########################################################################################################

spark = None
debug = False

def pyspark_init (i_spark, i_debug):
    '''
    Initialize module-level variables
    :param i_spark: an object of pyspark.sql.session.SparkSession
    :param i_debug: debug output of the below functions?
    '''

    from pyspark.sql.session import SparkSession
    if not isinstance(i_spark, SparkSession):   # check the passed-in argument, not the still-None module global
        raise TypeError("i_spark parameter should be of type SparkSession")

    global spark, debug
    (spark, debug) = (i_spark, i_debug)
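A minimal usage sketch for the new initializer (hedged: the app name is hypothetical; SparkSession.builder.getOrCreate() is the standard pyspark entry point):

from pyspark.sql import SparkSession
from abalon import pyspark as abalon_pyspark

session = SparkSession.builder.appName("abalon-demo").getOrCreate()   # hypothetical app name
abalon_pyspark.pyspark_init(session, True)                            # True turns on debug output from the helpers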


###########################################################################################################


def file_to_df (df_name, file_path, header=True, delimiter='|', inferSchema=True, cache=False):

"""
Reads in a delimited file and sets up a Spark dataframe
@@ -32,6 +53,7 @@ def file_to_df (df_name, file_path, header=True, delimiter='|', inferSchema=True


def sql_to_df (df_name, sql, cache=False):

"""
Runs an sql query and sets up a Spark dataframe
@@ -51,6 +73,7 @@ def sql_to_df (df_name, sql, cache=False):
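Both dataframe helpers are collapsed in this view, so the following sketch leans only on the visible signatures and docstrings; the file path, names, and query are hypothetical, and the assumption that df_name registers the result for later SQL access is inferred rather than shown:

from abalon import pyspark as abalon_pyspark   # as in the earlier sketch; pyspark_init() assumed already called

abalon_pyspark.file_to_df("customers", "/data/customers.csv", header=True, delimiter="|", inferSchema=True, cache=True)
abalon_pyspark.sql_to_df("active_customers", "SELECT * FROM customers WHERE active = 1")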

###########################################################################################################


def copyMerge (src_dir, dst_file, overwrite=False, deleteSource=False):

"""
@@ -87,10 +110,10 @@ def debug_print (message):

    try:
        # loop over files in alphabetical order and append them one by one to the target file
-       for file in files:
-           debug_print("Appending file {} into {}".format(file, dst_file))
+       for filename in files:
+           debug_print("Appending file {} into {}".format(filename, dst_file))

-           in_stream = fs.open(file)        # InputStream object
+           in_stream = fs.open(filename)    # InputStream object
            try:
                hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False)   # False means don't close out_stream
            finally:
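A hedged sketch of copyMerge, which appends the files of an HDFS directory into a single target file (the paths are hypothetical, and pyspark_init() presumably has to run first, since the loop above reaches the Hadoop IOUtils API through the module-level session):

from abalon import pyspark as abalon_pyspark   # as in the earlier sketches

# merge the part-files of a Spark output directory into one HDFS file
abalon_pyspark.copyMerge("/tmp/output_dir", "/tmp/merged.csv", overwrite=True, deleteSource=False)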
18 changes: 18 additions & 0 deletions setup.py
@@ -0,0 +1,18 @@

from setuptools import setup, find_packages

# http://setuptools.readthedocs.io/en/latest/setuptools.html

setup(name='abalon',
      version='1.0',
      packages=find_packages(),

      install_requires=['docutils>=0.3'],

      # metadata for upload to PyPI
      description='Various utility functions for Apache Spark (pySpark)',
      url='https://github.com/Tagar/abalon',
      author='Ruslan Dautkhanov',
      author_email='Dautkhanov@gmail.com',
      license='Apache-2.0',
      )
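With this setup.py in place, the package can be installed from a checkout in the usual setuptools ways; a sketch, since the commit itself carries no install instructions:

pip install .                 # from the repo root
python setup.py install       # the classic setuptools route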
