2 files added
4 files modified
285 ■■■■■ changed files
exploration_notebooks/compare-brands.ipynb 2 ●●● patch | view | raw | blame | history
exploration_notebooks/explore-enrichment.ipynb 2 ●●● patch | view | raw | blame | history
exploration_notebooks/extract-audio-devices.ipynb 112 ●●●●● patch | view | raw | blame | history
exploration_notebooks/extract-cellphones.ipynb 2 ●●● patch | view | raw | blame | history
exploration_notebooks/jdbc-mysql-connection.ipynb 2 ●●● patch | view | raw | blame | history
process.py 165 ●●●●● patch | view | raw | blame | history
exploration_notebooks/compare-brands.ipynb
....@@ -34,7 +34,7 @@
3434 },
3535 "outputs": [],
3636 "source": [
37
- "spark = SparkSession.builder.appName(\"Exploration\").enableHiveSupport().getOrCreate()"
37
+ "spark = SparkSession.builder.appName(\"Compare Brand Names\").enableHiveSupport().getOrCreate()"
3838 ]
3939 },
4040 {
exploration_notebooks/explore-enrichment.ipynb
....@@ -23,7 +23,7 @@
2323 "source": [
2424 "spark = SparkSession \\\n",
2525 " .builder \\\n",
26
- " .appName(\"Enirchment\") \\\n",
26
+ " .appName(\"Explore Enrichment\") \\\n",
2727 " .enableHiveSupport() \\\n",
2828 " .getOrCreate()"
2929 ]
exploration_notebooks/extract-audio-devices.ipynb
....@@ -0,0 +1,112 @@
1
+{
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "collapsed": true
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "from pyspark.sql import SparkSession\n",
12
+ "from pyspark.sql.functions import regexp_extract, regexp_replace\n",
13
+ "import spark_jupyter"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {
20
+ "collapsed": true
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "spark = SparkSession \\\n",
25
+ " .builder \\\n",
26
+ " .appName(\"Audio Enrichment\") \\\n",
27
+ " .enableHiveSupport() \\\n",
28
+ " .getOrCreate()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 6,
34
+ "metadata": {
35
+ "collapsed": true
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "# df = spark.read.csv('../datasets/affilinet_products-telefonde_441_666756.txt', header=True, sep=';')\n",
40
+ "df = spark.read.csv('../datasets/affilinet_products-cyberport_341_666756.txt', header=True, sep=';')"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 14,
46
+ "metadata": {
47
+ "collapsed": false
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "df.select(\n",
52
+ " df.Title,\n",
53
+ " regexp_extract(df.Title, '(?i)(headphones|headset|earphones|earbuds|speaker)', 0).alias('device_family'),\n",
54
+ " regexp_extract(df.Title, '(?i)bluetooth', 0).alias('bluetooth'),\n",
55
+ " regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])stereo[^A-Za-z\\d])', 0).alias('is_stereo'),\n",
56
+ " regexp_extract(df.Title, '(?i)(wifi|wi-fi|wi fi|wireless)', 0).alias('wifi'),\n",
57
+ " regexp_extract(df.Title, '(?i)(noise([ |-]cancelling)?([^A-Zaz-z\\d]|$))', 0).alias('noise_cancelling'),\n",
58
+ " regexp_extract(df.Title, '(?i)((mini|micro)*( |-)?usb( )?((\\d+)?(,|[.])?(\\d+))?)', 0).alias('usb_type'),\n",
59
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]adapter[^A-Za-z\\d])', 0).alias('is_adapter'),\n",
60
+ " regexp_extract(df.Title, '(?i)((sub)?woofer)', 0).alias('subwoofer'),\n",
61
+ " regexp_extract(df.Title, '(?i)([^A-Za-z&\\d](\\d+)( )*(w|watt)([^A-Za-z&\\d]|$))', 0).alias('power'),\n",
62
+ " regexp_extract(df.Title, '(?i)([\\d][.][\\d])', 0).alias('speakers_organization'),\n",
63
+ " regexp_extract(df.Title, '(?i)(surround)', 0).alias('surround'),\n",
64
+ " regexp_extract(df.Title, '(?i)(dlna)', 0).alias('dlna'),\n",
65
+ " regexp_extract(df.Title, '(?i)(wlan)', 0).alias('wlan'),\n",
66
+ " regexp_extract(df.Title, '(?i)(blu-ray|bluray)', 0).alias('blu_ray'),\n",
67
+ " regexp_extract(df.Title, '(?i)(streaming)', 0).alias('streaming'),\n",
68
+ " regexp_extract(df.Title, '(?i)(airplay)', 0).alias('airplay'),\n",
69
+ " regexp_extract(df.Title, '(?i)(([A-Za-z])*-dock| dock( |$)|dockingsystem)', 0).alias('is_dock'),\n",
70
+ " regexp_extract(df.Title, '(?i)(internet( )*radio)', 0).alias('internet_radio'),\n",
71
+ " regexp_extract(df.Title, '(?i)([ |-](mini|micro)[ |-])', 0).alias('small_speakers'),\n",
72
+ " regexp_extract(df.Title, '(?i)([ |-](cinema)[ |-])', 0).alias('cinema'),\n",
73
+ " regexp_extract(df.Title, '(?i)(((fm)( |-)radio)| fm | radio )', 0).alias('fm_radio'),\n",
74
+ " regexp_extract(df.Title, '(?i)(portable)', 0).alias('portable'),\n",
75
+ ").coalesce(1).write.mode('overwrite').csv('out.csv', header=True)"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {
82
+ "collapsed": true
83
+ },
84
+ "outputs": [],
85
+ "source": [
86
+ ""
87
+ ]
88
+ }
89
+ ],
90
+ "metadata": {
91
+ "anaconda-cloud": {},
92
+ "kernelspec": {
93
+ "display_name": "Python [default]",
94
+ "language": "python",
95
+ "name": "python2"
96
+ },
97
+ "language_info": {
98
+ "codemirror_mode": {
99
+ "name": "ipython",
100
+ "version": 2.0
101
+ },
102
+ "file_extension": ".py",
103
+ "mimetype": "text/x-python",
104
+ "name": "python",
105
+ "nbconvert_exporter": "python",
106
+ "pygments_lexer": "ipython2",
107
+ "version": "2.7.12"
108
+ }
109
+ },
110
+ "nbformat": 4,
111
+ "nbformat_minor": 0
112
+}
exploration_notebooks/extract-cellphones.ipynb
....@@ -23,7 +23,7 @@
2323 "source": [
2424 "spark = SparkSession \\\n",
2525 " .builder \\\n",
26
- " .appName(\"Enirchment\") \\\n",
26
+ " .appName(\"Cellphones Enrichment\") \\\n",
2727 " .enableHiveSupport() \\\n",
2828 " .getOrCreate()"
2929 ]
exploration_notebooks/jdbc-mysql-connection.ipynb
....@@ -22,7 +22,7 @@
2222 "source": [
2323 "spark = SparkSession \\\n",
2424 " .builder \\\n",
25
- " .appName(\"Enirchment\") \\\n",
25
+ " .appName(\"Test MySQL JDBC Connection\") \\\n",
2626 " .enableHiveSupport() \\\n",
2727 " .getOrCreate()"
2828 ]
process.py
....@@ -0,0 +1,165 @@
1
+from pyspark.sql import SparkSession
2
+from pyspark.sql.functions import regexp_extract, regexp_replace
3
+from pyspark.sql.types import *
4
+from optparse import OptionParser
5
+import sys
6
+import re
7
+reload(sys)
8
+sys.setdefaultencoding('UTF8')
9
+
10
+
11
def retrieve_string(x):
    """Return ``x`` encoded as UTF-8 with surrounding whitespace stripped.

    :param x: value to normalise; expected to offer ``encode`` (a string).
    :return: the encoded, stripped value, or ``None`` when ``x`` (or its
        encoded form) lacks the required string methods, e.g. ``None``.
    """
    try:
        encoded = x.encode('utf-8')
        cleaned = encoded.strip()
    except AttributeError:
        # Not string-like (for instance a missing cell): report it as absent.
        cleaned = None
    return cleaned
16
+
17
+
18
def extract_brand_name(x):
    """Remove the brand and manufacturer names from a product title.

    :param x: row-like object exposing ``Title``, ``Brand``,
        ``Manufacturer`` and ``ArtNumber`` attributes.
    :return: 5-tuple ``(cleaned_title, title, brand, manufacturer,
        art_number)``. Every element is the ``retrieve_string`` form of the
        corresponding attribute; ``cleaned_title`` is the title with all
        case-insensitive occurrences of the brand and then of the
        manufacturer removed. ``cleaned_title`` is ``None`` when the title
        itself is missing.
    """
    title = retrieve_string(x.Title)
    brand_info = retrieve_string(x.Brand)
    man_info = retrieve_string(x.Manufacturer)

    res = title
    if res is not None:
        for name in (brand_info, man_info):
            # Missing or empty names leave the title untouched.
            if not name:
                continue
            # The names are plain data, not patterns: escape them so that
            # characters such as "+", "(" or "." are matched literally
            # instead of raising re.error or matching unintended text.
            literal = re.compile(re.escape(name), re.IGNORECASE)
            res = literal.sub("", res)

    return (
        res,
        title,
        brand_info,
        man_info,
        retrieve_string(x.ArtNumber)
    )
36
+
37
# Ordered (output column, regular expression) pairs applied to the cleaned
# product title. Each expression is handed to Spark's regexp_extract with
# group index 0, i.e. the whole match is kept. The order of this list is the
# order of the feature columns in the written CSV.
TITLE_FEATURES = [
    ('device_type',
     r'(?i)(tablet|phone|adapter|router|laptop|notebook|touch|ipad|iphone|macbook[A-Za-z]*)( |$)'),
    ('graphic_card', r'(?i)(( |[^A-Za-z])(gt(x)?|hd|gf)( )*(\d+)m)'),
    ('voltage', r'(?i)((/| )(\d+)+( +)?(v))+( |/|$)'),
    ('intel_cpu_info', r'(?i)(i(\d)-(\d+)([A-Za-z])?)'),
    ('is_ssd', r' SSD '),
    # Decimal separator is "," or a literal "." (same form as the other
    # numeric patterns), not "any character".
    ('dimensions',
     r'(?i)((\d+)(,|[.])?( )* x (\d+)(,|[.])?( *)?(c|m|m|k)m)([^A-Za-z]|$)'),
    ('length',
     r'(?i)((\(| )(\d+)(,|[.])?(\d)*( )*((c|m|m|k)*m|zoll)([^A-Za-z]|$))'),
    ('weight', r'(?i)( (\d+)(,|[.])?(\d+)?( )*(m| |k)g([^A-Za-z]|$))'),
    ('OS', r'(?i)([ |/]mac|win[" ",/,"d"]|linux|noos|android)'),
    ('throughput', r'(?i)( (\d+)(,|[.])?(\d+)?( )*([m|k|]bit))'),
    ('memory_type', r'(?i)(ddr(\d+))'),
    ('rpm', r'(?i)( (\d+)(,|[.])?(\d+)?( )*(rpm))'),
    ('screen_resolution', r'(?i)((\d+):(\d+))'),
    # A single "[full ]hd" occurrence is enough; the expression must not be
    # repeated back to back, otherwise only titles with two consecutive
    # "hd" tokens would match.
    ('screen_definition', r'(?i)((full| )( |-)*)*hd[^A-Za-z\d]'),
    ('display_port', r'(?i)([^A-Za-z\d](hdmi|dvi|vga))+([^A-Za-z]|$)+'),
    ('pixels', r'(?i)((\d+)[.|,]?(\d*)[^A-Za-z\d]*(m|g)p([^A-Za-z]|$))'),
    ('hdmi', r'(?i)([^A-Za-z\d]hdmi[^A-Za-z\d])'),
    ('ips', r'(?i)([^A-Za-z\d]ips[^A-Za-z\d])'),
    ('equipment_type',
     r'(?i)(usb|wired|bluetooth|gaming|optical|wireless|ergonomic)*( )?( speaker | mouse | keyboard | monitor | sleeve | case )'),
    ('cpu_core_family',
     r'(?i)(\S+-core)|((intel)?( |/|-)core( )?i\d+)|((dual|quad|hexa)( )?core)'),
    ('monitor_type', r'(?i)([^A-Za-z\d](led|lcd|tft)[^A-Za-z\d])+'),
    ('sony_camera_model', r'(?i)([A-Za-z]*-([A-za-z]+\d+)[A-Za-z]*)'),
    ('focal_length',
     r'(?i)( f[/]? *(\d+[.|,]?(\d*)( )*l))|( f[/]? *(\d+[.|,]?(\d*)( )*)-\d+[.|,]?(\d*))'),
    ('objektiv', r'(?i)(objektiv\w*)'),
    ('usm', r'(?i)(( )+usm( )+)'),
    ('retina', r'(?i)([^A-Za-z\d]retina([^A-Za-z\d]|$))'),
    ('volume', r'(?i)(| |/)(\d+)(,|[.])?(\d)*( )*((c|m|k)*l|liter)( |$)'),
    ('triple_dimensions',
     r'(?i)(\d+)[,|.]*(\d*)( )*x( )*(\d+)[,|.]?(\d*)( )*x( )*(\d+)[,|.]*(\d*)( )*(m|c|k)m([^A-Za-z]|$)'),
    ('concentration', r'(?i)(| |/)(\d+)[,|.]?(\d)*( )*(%)+'),
    # "<count> x <amount> l" style quantities. The column needs a real
    # name: an empty alias produces a blank header in the CSV.
    ('pack_volume', r'(?i)(\d+)( )*x( )*(\d+)( )*(c|m|k)*l( |$)'),
    ('device_family', r'(?i)(headphones|headset|earphones|earbuds|speaker)'),
    ('is_stereo', r'(?i)((^|[^A-Za-z\d])stereo[^A-Za-z\d])'),
    ('noise_cancelling', r'(?i)(noise([ |-]cancelling)?([^A-Za-z\d]|$))'),
    ('is_adapter', r'(?i)([^A-Za-z\d]adapter[^A-Za-z\d])'),
    ('subwoofer', r'(?i)((sub)?woofer)'),
    ('speakers_organization', r'(?i)([\d][.][\d])'),
    ('surround', r'(?i)(surround)'),
    ('dlna', r'(?i)(dlna)'),
    ('blu_ray', r'(?i)(blu-ray|bluray)'),
    ('streaming', r'(?i)(streaming)'),
    ('airplay', r'(?i)(airplay)'),
    ('is_dock', r'(?i)(([A-Za-z])*-dock| dock( |$)|dockingsystem)'),
    ('internet_radio', r'(?i)(internet( )*radio)'),
    ('small_speakers', r'(?i)([ |-](mini|micro)[ |-])'),
    ('cinema', r'(?i)([ |-](cinema)[ |-])'),
    ('fm_radio', r'(?i)(((fm)( |-)radio)| fm | radio )'),
    ('portable', r'(?i)(portable)'),
    ('lte', r'(?i)([^A-Za-z\d]lte)[^A-Za-z]'),
    ('operator',
     r'(?i)([^A-Za-z\d]*(o2|telekom|vodafone|e-plus|mowotel)[^A-Za-z])'),
    ('contract',
     r'(?i)([^A-Za-z\d](\d+( )?(mon.|monat|month)+)*( )?(vertrag|contract))'),
    ('mobile_internet', r'(?i)([^A-Za-z\d](mobile( )?(data|internet)))'),
    ('umts', r'(?i)([^A-Za-z\d]umts)[^A-Za-z]'),
    ('wlan', r'(?i)([^A-Za-z\d]wlan)[^A-Za-z]'),
    ('router', r'(?i)([^A-Za-z\d]router)[^A-Za-z]'),
    ('keyboard_dock', r'(?i)(tastatur|keyboard( )?dock)'),
    ('battery_capacity', r'(?i)( (\d+)(,|[.])?(\d+)?( )*(mah))'),
    ('memory', r'(?i)((| )(\d+)+( +)?((g|k|m|t)B))+([^A-Za-z]|$)'),
    ('sim_card_type',
     r'(?i)(dualsim|dual-sim|dual sim|nano sim|nano-sim|nanosim|nano-micro-sim|microsim|micro sim|micro-sim)'),
    ('usb_type', r'(?i)((mini|micro)*( |-)?usb( )?((\d+)?(,|[.])?(\d+))?)'),
    ('bluetooth', r'(?i)bluetooth'),
    ('wifi', r'(?i)(wifi|wi-fi|wi fi|wireless)'),
    ('power', r'(?i)([^A-Za-z&\d](\d+)( )*(w|watt)([^A-Za-z&\d]|$))'),
    # Literal "." as decimal separator, consistent with the patterns above.
    ('cpu_power', r'(?i)( (\d+)(,|[.])?(\d)*( )*(k|m|g)*hz)'),
    ('screen_features',
     r'(?i)( wide |widescreen|gaming monitor|wide screen|ultra( )?wide)+|(touch( )?screen)'),
    ('is_charger', r'(?i)([^A-Za-z\d]charger[^A-Za-z\d])'),
    ('hspa', r'(?i)([^A-Za-z\d]hspa)[^A-Za-z]'),
    ('hotspot', r'(?i)([^A-Za-z\d]hotspot)[^A-Za-z]'),
    ('vga', r'(?i)([^A-Za-z\d]vga)[^A-Za-z]'),
    ('headphones',
     r'(?i)(^|[^A-Za-z\d])(headphones|headset|earphones)([^A-Za-z\d]|$)'),
    ('android', r'(?i)(android( )?(\d+(.)?(\d*))[^A-Za-z]|$)'),
    ('is_tablet', r'(?i)([^A-Za-z\d]tablet([^A-Za-z\d]|$))'),
    ('nfc', r'(?i)([^A-Za-z\d]nfc([^A-Za-z\d]|$))'),
    ('gps', r'(?i)gps'),
    ('mobile_network_type',
     r'(?i)((^|[^A-Za-z\d](2|3|4)g($|[^A-Za-z\d])))'),
    ('sata_disk_family',
     r'(?i)(([a-z]|[^A-Za-z\d])sata( |-)?(\d)*[^A-Za-z])'),
]

# Command line: -f/--filepath is the ';'-separated input file (required),
# -o/--output optionally overrides where the result is written.
opt_parser = OptionParser()
opt_parser.add_option("-f", "--filepath", dest="file_path",
                      help="Path to an input file")
opt_parser.add_option("-o", "--output", dest="output_path",
                      default='/home/agaton/Desktop/out.csv',
                      help="Path the resulting CSV is written to "
                           "[default: %default]")
(opts, args) = opt_parser.parse_args()

file_path = opts.file_path
if not file_path:
    # Stop with a usage message instead of handing None to the CSV reader.
    opt_parser.error("an input file is required (-f/--filepath)")

spark = SparkSession\
    .builder\
    .appName("Processing")\
    .enableHiveSupport()\
    .getOrCreate()

# Handle brand names: strip brand/manufacturer from every title row by row.
df = spark.read.csv(file_path, header=True, sep=';')
rdd_data = df.rdd.map(extract_brand_name)

# Column layout of the tuples produced by extract_brand_name.
schema = StructType([
    StructField('title', StringType()),
    StructField('original_title', StringType()),
    StructField('brand', StringType()),
    StructField('manufacturer', StringType()),
    StructField('art_number', StringType())
])
df = spark.createDataFrame(rdd_data, schema)

# One extracted column per entry of TITLE_FEATURES, in table order.
feature_columns = [
    regexp_extract(df.title, pattern, 0).alias(column_name)
    for column_name, pattern in TITLE_FEATURES
]

df = df.select(
    df.original_title,
    df.title,
    df.brand,
    df.manufacturer,
    df.art_number,
    *feature_columns
)

# coalesce(1) collapses the result into a single partition before writing.
df.coalesce(1).write.mode('overwrite').csv(opts.output_path, header=True)