abea5faa73e8dba16a56127f2f928c9b96611415..ec885e259f31c157350ed0d51925b30e573908bb
2017-04-19 milovanovicm
EPD-24 Extract cellphones features - added exploration notebook;
ec885e diff | tree
2017-04-19 milovanovicm
EPD-29 Testing smart commits
b626bf diff | tree
1 file added
1 file deleted
1 file modified
337 ■■■■■ changed files
exploration_notebooks/explore-enrichment.ipynb 33 ●●●●● patch | view | raw | blame | history
exploration_notebooks/extract-cellphones.ipynb 119 ●●●●● patch | view | raw | blame | history
explore-enrichment.ipynb 185 ●●●●● patch | view | raw | blame | history
exploration_notebooks/explore-enrichment.ipynb
....@@ -2,7 +2,7 @@
22 "cells": [
33 {
44 "cell_type": "code",
5
- "execution_count": null,
5
+ "execution_count": 1,
66 "metadata": {
77 "collapsed": true
88 },
....@@ -15,7 +15,7 @@
1515 },
1616 {
1717 "cell_type": "code",
18
- "execution_count": null,
18
+ "execution_count": 2,
1919 "metadata": {
2020 "collapsed": true
2121 },
....@@ -30,6 +30,35 @@
3030 },
3131 {
3232 "cell_type": "code",
33
+ "execution_count": 8,
34
+ "metadata": {
35
+ "collapsed": false
36
+ },
37
+ "outputs": [
38
+ {
39
+ "name": "stdout",
40
+ "output_type": "stream",
41
+ "text": [
42
+ "+-----------------------+\n",
43
+ "|levenshtein(col1, col2)|\n",
44
+ "+-----------------------+\n",
45
+ "|2 |\n",
46
+ "|0 |\n",
47
+ "|5 |\n",
48
+ "+-----------------------+\n",
49
+ "\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "from pyspark.sql.functions import levenshtein, lit\n",
55
+ "data = [('Alice', 'Alicio'), ('Alice', 'Alice'), ('Alice', 'Bob')]\n",
56
+ "df = spark.createDataFrame(data, ['col1', 'col2'])\n",
57
+ "df.select(levenshtein('col1', 'col2')).show(truncate=False)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
3362 "execution_count": 59,
3463 "metadata": {
3564 "collapsed": false
exploration_notebooks/extract-cellphones.ipynb
....@@ -0,0 +1,119 @@
1
+{
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "collapsed": true
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "from pyspark.sql import SparkSession\n",
12
+ "from pyspark.sql.functions import regexp_extract, regexp_replace\n",
13
+ "import spark_jupyter"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {
20
+ "collapsed": true
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "spark = SparkSession \\\n",
25
+ " .builder \\\n",
26
+ " .appName(\"Enirchment\") \\\n",
27
+ " .enableHiveSupport() \\\n",
28
+ " .getOrCreate()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 5,
34
+ "metadata": {
35
+ "collapsed": false
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "df = spark.read.csv('../datasets/affilinet_products-cyberport_341_666756.txt', header=True, sep=';')"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 26,
45
+ "metadata": {
46
+ "collapsed": false
47
+ },
48
+ "outputs": [],
49
+ "source": [
50
+ "df.select(\n",
51
+ " df.Title,\n",
52
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]lte)[^A-Za-z]', 0).alias('lte'),\n",
53
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]*(o2|telekom|vodafone|e-plus|mowotel)[^A-Za-z])', 0).alias('operator'),\n",
54
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d](\\d+( )?(mon.|monat|month)+)*( )?(vertrag|contract))', 0).alias('contract'),\n",
55
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d](mobile( )?(data|internet)))', 0).alias('mobile_internet'),\n",
56
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]umts)[^A-Za-z]', 0).alias('umts'),\n",
57
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]wlan)[^A-Za-z]', 0).alias('wlan'),\n",
58
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]router)[^A-Za-z]', 0).alias('router'),\n",
59
+ " regexp_extract(df.Title, '(?i)(tastatur|keyboard( )?dock)', 0).alias('keyboard_dock'),\n",
60
+ " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d+)?( )*(mah))', 0).alias('battery_capacity'),\n",
61
+ " regexp_extract(df.Title, '(?i)((| )(\\d+)+( +)?((g|k|m|t)B))+([^A-Za-z]|$)', 0).alias('memory'),\n",
62
+ " regexp_extract(df.Title, '(?i)(dualsim|dual-sim|dual sim|nano sim|nano-sim|nanosim|nano-micro-sim|microsim|micro sim|micro-sim)', 0).alias('sim_card_type'),\n",
63
+ " regexp_extract(df.Title, '(?i)((mini|micro)*( |-)?usb( )?((\\d+)?(,|[.])?(\\d+))?)', 0).alias('usb_type'),\n",
64
+ " regexp_extract(df.Title, '(?i)bluetooth', 0).alias('bluetooth'),\n",
65
+ " regexp_extract(df.Title, '(?i)(wifi|wi-fi|wi fi|wireless)', 0).alias('wifi'),\n",
66
+ " regexp_extract(df.Title, '(?i)([^A-Za-z&\\d](\\d+)( )*(w|watt)([^A-Za-z&\\d]|$))', 0).alias('power'),\n",
67
+ " regexp_extract(df.Title, '(?i)( (\\d+)(,|.)?(\\d)*( )*(k|m|g)*hz)', 0).alias('cpu_power'),\n",
68
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]hdmi[^A-Za-z\\d])', 0).alias('hdmi'),\n",
69
+ " regexp_extract(df.Title, '(?i)( wide |widescreen|gaming monitor|wide screen|ultra( )?wide)+|(touch( )?screen)', 0).alias('screen_features'),\n",
70
+ " regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])stereo[^A-Za-z\\d])', 0).alias('is_stereo'),\n",
71
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]adapter[^A-Za-z\\d])', 0).alias('is_adapter'),\n",
72
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]charger[^A-Za-z\\d])', 0).alias('is_charger'),\n",
73
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]hspa)[^A-Za-z]', 0).alias('hspa'),\n",
74
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]hotspot)[^A-Za-z]', 0).alias('hotspot'),\n",
75
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]vga)[^A-Za-z]', 0).alias('vga'),\n",
76
+ " regexp_extract(df.Title, '(?i)((\\(| )(\\d+)(,|[.])?(\\d)*( )*((c|m|m|k)*m|zoll)([^A-Za-z]|$))', 0).alias('length'),\n",
77
+ " regexp_extract(df.Title, '(?i)(^|[^A-Za-z\\d])(headphones|headset|earphones)([^A-Za-z\\d]|$)', 0).alias('headphones'),\n",
78
+ " regexp_extract(df.Title, '(?i)(android( )?(\\d+(.)?(\\d*))[^A-Za-z]|$)', 0).alias('android'),\n",
79
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]tablet([^A-Za-z\\d]|$))', 0).alias('is_tablet'),\n",
80
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]nfc([^A-Za-z\\d]|$))', 0).alias('nfc'),\n",
81
+ ").coalesce(1).write.mode('overwrite').csv('out.csv', header=True)\n",
82
+ "# show(truncate=False)"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "metadata": {
89
+ "collapsed": true
90
+ },
91
+ "outputs": [],
92
+ "source": [
93
+ ""
94
+ ]
95
+ }
96
+ ],
97
+ "metadata": {
98
+ "anaconda-cloud": {},
99
+ "kernelspec": {
100
+ "display_name": "Python [default]",
101
+ "language": "python",
102
+ "name": "python2"
103
+ },
104
+ "language_info": {
105
+ "codemirror_mode": {
106
+ "name": "ipython",
107
+ "version": 2.0
108
+ },
109
+ "file_extension": ".py",
110
+ "mimetype": "text/x-python",
111
+ "name": "python",
112
+ "nbconvert_exporter": "python",
113
+ "pygments_lexer": "ipython2",
114
+ "version": "2.7.12"
115
+ }
116
+ },
117
+ "nbformat": 4,
118
+ "nbformat_minor": 0
119
+}
explore-enrichment.ipynb
deleted file mode 100644
....@@ -1,185 +0,0 @@
1
-{
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {
7
- "collapsed": true
8
- },
9
- "outputs": [],
10
- "source": [
11
- "from pyspark.sql import SparkSession\n",
12
- "from pyspark.sql.functions import regexp_extract, regexp_replace\n",
13
- "import spark_jupyter"
14
- ]
15
- },
16
- {
17
- "cell_type": "code",
18
- "execution_count": 2,
19
- "metadata": {
20
- "collapsed": true
21
- },
22
- "outputs": [],
23
- "source": [
24
- "spark = SparkSession \\\n",
25
- " .builder \\\n",
26
- " .appName(\"Enirchment\") \\\n",
27
- " .enableHiveSupport() \\\n",
28
- " .getOrCreate()"
29
- ]
30
- },
31
- {
32
- "cell_type": "code",
33
- "execution_count": 48,
34
- "metadata": {
35
- "collapsed": false
36
- },
37
- "outputs": [],
38
- "source": [
39
- "df = spark.read.csv('/home/agaton/Downloads/affilinet_products-toysrus_1401_666756.txt', header=True, sep=';')\n",
40
- "# df = spark.read.csv('affilinet_products-fashion-for-home-mbel_1073_666756 (1).txt', header=True, sep=';')\n",
41
- "# df = spark.read.csv('affilinet_products-mactrade-appel_256_666756.txt', header=True, sep=';')"
42
- ]
43
- },
44
- {
45
- "cell_type": "code",
46
- "execution_count": 49,
47
- "metadata": {
48
- "collapsed": false
49
- },
50
- "outputs": [],
51
- "source": [
52
- "# from pyspark.sql.functions import udf\n",
53
- "# from pyspark.sql.types import *\n",
54
- "# df"
55
- ]
56
- },
57
- {
58
- "cell_type": "code",
59
- "execution_count": 50,
60
- "metadata": {
61
- "collapsed": false
62
- },
63
- "outputs": [],
64
- "source": [
65
- "# df.show(200, truncate=False)"
66
- ]
67
- },
68
- {
69
- "cell_type": "code",
70
- "execution_count": 51,
71
- "metadata": {
72
- "collapsed": false
73
- },
74
- "outputs": [],
75
- "source": [
76
- "df.select(\n",
77
- " df.Title,\n",
78
- " regexp_extract(df.Title, '(?i)(( |/)(\\d+)( )*(w|watt)([^A-Za-z]| |/|$))', 0).alias('power'),\n",
79
- " regexp_extract(df.Title, '(?i)((/| )(\\d+)+( +)?(v))+( |/|$)', 0).alias('voltage'),\n",
80
- " regexp_extract(df.Title, '(?i)((| )(\\d+)+( +)?((g|k|m|t)B))+([^A-Za-z]|$)', 0).alias('memory'),\n",
81
- " regexp_extract(df.Title, '(i(\\d)-(\\d+)([A-Za-z])?)', 0).alias('intel_cpuinfo'),\n",
82
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|.)?(\\d)*( )*(k|m|g)*hz)', 0).alias('cpu_power'),\n",
83
- " regexp_extract(df.Title, ' SSD ', 0).alias('disk_type'),\n",
84
- " regexp_extract(df.Title, '(?i)((\\d+)(,|[.])?( )* x (\\d+)(,|.)?( *)?(c|m|m|k)m)([^A-Za-z]|$)', 0).alias('dimensions'),\n",
85
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d)*( )*(c|m|m|k)*m)([^A-Za-z]|$)', 0).alias('length'),\n",
86
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d+)?( )*(m| |k)g([^A-Za-z]|$))', 0).alias('weight'),\n",
87
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d+)?( )*(mah))', 0).alias('battery_capacity'),\n",
88
- " regexp_extract(df.Title, '(?i)([ |/]mac|win[\" \",/,\"d\"]|linux)', 0).alias('OS'),\n",
89
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d+)?( )*([m|k|]bit))', 0).alias('throughput'),\n",
90
- " regexp_extract(df.Title, '(?i)(ddr(\\d+))', 0).alias('memory_type'),\n",
91
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d+)?( )*(rpm))', 0).alias('rpm'),\n",
92
- " regexp_extract(df.Title, '(?i)((\\d+):(\\d+))', 0).alias('screen_resolution'),\n",
93
- " regexp_extract(df.Title, '(?i)(usb( )?((\\d+)?(,|[.])?(\\d+))?)', 0).alias('usb_type'),\n",
94
- " regexp_extract(df.Title, '(?i)((full)( |-)*)*hd', 0).alias('screen_definition'),\n",
95
- " regexp_extract(df.Title, '(?i)((vga)*(dvi)*(hdmi)*)', 0).alias('display_port')\n",
96
- ").coalesce(1).write.mode('overwrite').csv('out.csv', header=True)\n",
97
- "# show(500, truncate=False)"
98
- ]
99
- },
100
- {
101
- "cell_type": "code",
102
- "execution_count": 58,
103
- "metadata": {
104
- "collapsed": false
105
- },
106
- "outputs": [
107
- {
108
- "name": "stdout",
109
- "output_type": "stream",
110
- "text": [
111
- "4GB\n",
112
- "500GB\n"
113
- ]
114
- }
115
- ],
116
- "source": [
117
- "sample = 'Acer TravelMate P253-M-33114G50Maks i3-3110M 4GB 500GB matt Win7 Pro + Win8 Pro'\n",
118
- "import re\n",
119
- "pattern = '\\d*( |)[A-Za-z]B'\n",
120
- "for match in re.finditer(pattern, sample, re.DOTALL):\n",
121
- " print match.group(0)"
122
- ]
123
- },
124
- {
125
- "cell_type": "code",
126
- "execution_count": 52,
127
- "metadata": {
128
- "collapsed": false
129
- },
130
- "outputs": [],
131
- "source": [
132
- "df = spark.read.csv('/home/agaton/Downloads/affilinet_products-basler-kosmetik_1289_666756.txt', header=True, sep=';')"
133
- ]
134
- },
135
- {
136
- "cell_type": "code",
137
- "execution_count": 60,
138
- "metadata": {
139
- "collapsed": true
140
- },
141
- "outputs": [],
142
- "source": [
143
- "df.select(\n",
144
- " df.Title,\n",
145
- " regexp_extract(df.Title, '(?i)(| |/)(\\d+)(,|[.])?(\\d)*( )*((c|m|k)*l|liter)( |$)', 0).alias('volume'),\n",
146
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d)*( )*(c|m|m|k)*m)([^A-Za-z]|$)', 0).alias('length'),\n",
147
- " regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d+)?( )*(m| |k)g([^A-Za-z]|$))', 0).alias('weight'),\n",
148
- " regexp_extract(df.Title, '(?i)(| |/)(\\d+)[,|.]?(\\d)*( )*(%)+', 0).alias('concentration'),\n",
149
- " regexp_extract(df.Title, '(?i)(\\d+)( )*x( )*(\\d+)( )*(c|m|k)*l( |$)', 0).alias('packaging_volume')\n",
150
- ").coalesce(1).write.mode('overwrite').csv('kosmetic.csv', header=True)"
151
- ]
152
- },
153
- {
154
- "cell_type": "code",
155
- "execution_count": null,
156
- "metadata": {
157
- "collapsed": true
158
- },
159
- "outputs": [],
160
- "source": []
161
- }
162
- ],
163
- "metadata": {
164
- "anaconda-cloud": {},
165
- "kernelspec": {
166
- "display_name": "Python [default]",
167
- "language": "python",
168
- "name": "python2"
169
- },
170
- "language_info": {
171
- "codemirror_mode": {
172
- "name": "ipython",
173
- "version": 2
174
- },
175
- "file_extension": ".py",
176
- "mimetype": "text/x-python",
177
- "name": "python",
178
- "nbconvert_exporter": "python",
179
- "pygments_lexer": "ipython2",
180
- "version": "2.7.12"
181
- }
182
- },
183
- "nbformat": 4,
184
- "nbformat_minor": 1
185
-}