3 files added
269 ■■■■■ changed files
exploration_notebooks/extract-cellphone-accessories.ipynb 79 ●●●●● patch | view | raw | blame | history
exploration_notebooks/extract-network-appliances.ipynb 96 ●●●●● patch | view | raw | blame | history
exploration_notebooks/extract-ram-memory.ipynb 94 ●●●●● patch | view | raw | blame | history
exploration_notebooks/extract-cellphone-accessories.ipynb
....@@ -0,0 +1,79 @@
1
+{
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "collapsed": true
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "from pyspark.sql import SparkSession\n",
12
+ "from pyspark.sql.functions import regexp_extract, regexp_replace\n",
13
+ "import spark_jupyter"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {
20
+ "collapsed": true
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "spark = SparkSession \\\n",
25
+ " .builder \\\n",
26
+    "    .appName(\"Enrichment\") \\\n",
27
+ " .enableHiveSupport() \\\n",
28
+ " .getOrCreate()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {
35
+ "collapsed": true
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "df = spark.read.csv('../datasets/affilinet_products-cyberport_341_666756.txt', header=True, sep=';')"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {
46
+ "collapsed": true
47
+ },
48
+ "outputs": [],
49
+ "source": [
50
+ "df.select(\n",
51
+ " df.Title,\n",
52
+ " regexp_extract(df.Title, '', 0).alias(''),\n",
53
+ ").coalesce(1).write.mode('overwrite').csv('out.csv', header=True)"
54
+ ]
55
+ }
56
+ ],
57
+ "metadata": {
58
+ "anaconda-cloud": {},
59
+ "kernelspec": {
60
+ "display_name": "Python [default]",
61
+ "language": "python",
62
+ "name": "python2"
63
+ },
64
+ "language_info": {
65
+ "codemirror_mode": {
66
+ "name": "ipython",
67
+ "version": 2
68
+ },
69
+ "file_extension": ".py",
70
+ "mimetype": "text/x-python",
71
+ "name": "python",
72
+ "nbconvert_exporter": "python",
73
+ "pygments_lexer": "ipython2",
74
+ "version": "2.7.12"
75
+ }
76
+ },
77
+ "nbformat": 4,
78
+ "nbformat_minor": 1
79
+}
exploration_notebooks/extract-network-appliances.ipynb
....@@ -0,0 +1,96 @@
1
+{
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "collapsed": true
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "from pyspark.sql import SparkSession\n",
12
+ "from pyspark.sql.functions import regexp_extract, regexp_replace\n",
13
+ "import spark_jupyter"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {
20
+ "collapsed": true
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "spark = SparkSession \\\n",
25
+ " .builder \\\n",
26
+    "    .appName(\"Network Appliances Enrichment\") \\\n",
27
+ " .enableHiveSupport() \\\n",
28
+ " .getOrCreate()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {
35
+ "collapsed": true
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "df = spark.read.csv('../datasets/affilinet_products-cyberport_341_666756.txt', header=True, sep=';')"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 5,
45
+ "metadata": {
46
+ "collapsed": false
47
+ },
48
+ "outputs": [],
49
+ "source": [
50
+ "df.select(\n",
51
+ " df.Title,\n",
52
+ " df.ProductCategoryName,\n",
53
+    "    regexp_extract(df.Title, '(?i)(wifi|wi-fi|wi fi|wireless)', 0).alias('wifi'),\n",
54
+ " regexp_extract(df.Title, '(?i)(wlan-[A-Za-z]|wlan)', 0).alias('wlan'),\n",
55
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]adapter[^A-Za-z\\d])', 0).alias('is_adapter'),\n",
56
+    "    regexp_extract(df.Title, '(?i)( (\\d+)(,|[.])?(\\d+)?( )*([mk]bit))', 0).alias('throughput'),\n",
57
+ " regexp_extract(df.Title, '(?i)(dualband|dual band|dual-band)', 0).alias('dual_band'),\n",
58
+ " regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])stick($|[^A-Za-z\\d]))', 0).alias('is_stick'),\n",
59
+ " regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])lan($|[^A-Za-z\\d]))', 0).alias('lan'),\n",
60
+    "    regexp_extract(df.Title, '(?i)(access[ -]?point)', 0).alias('access_point'),\n",
61
+ " regexp_extract(df.Title, '(?i)(hotspot)', 0).alias('hotspot'),\n",
62
+ " regexp_extract(df.Title, '(?i)(gateway)', 0).alias('gateway'),\n",
63
+ " regexp_extract(df.Title, '(?i)(outdoor|indoor)', 0).alias('indoor_outdoor'),\n",
64
+    "    regexp_extract(df.Title, '(?i)antenn[ae]', 0).alias('antenna'),\n",
65
+ " regexp_extract(df.Title, '(?i)((\\d+)( )?(dbi|dbd))', 0).alias('antenna_gain'),\n",
66
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]bridge[^A-Za-z\\d])', 0).alias('is_bridge'),\n",
67
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]switch[^A-Za-z\\d])', 0).alias('is_switch'),\n",
68
+    "    regexp_extract(df.Title, '(?i)(pci[ -]card)', 0).alias('pci_card'),\n",
69
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]router[^A-Za-z\\d])', 0).alias('router'),\n",
70
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]dongle[^A-Za-z\\d])', 0).alias('is_dongle'),\n",
71
+ " regexp_extract(df.Title, '(?i)([^A-Za-z\\d]repeater[^A-Za-z\\d])', 0).alias('is_repeater'),\n",
72
+    "    regexp_extract(df.Title, '(?i)([^A-Za-z\\d]controller[^A-Za-z\\d])', 0).alias('is_controller'),\n",
73
+ ").coalesce(1).write.mode('overwrite').csv('out.csv', header=True)"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {
80
+ "collapsed": true
81
+ },
82
+ "outputs": [],
83
+ "source": []
84
+ }
85
+ ],
86
+ "metadata": {
87
+ "anaconda-cloud": {},
88
+ "kernelspec": {
89
+ "display_name": "Python [default]",
90
+ "language": "python",
91
+ "name": "python2"
92
+ }
93
+ },
94
+ "nbformat": 4,
95
+ "nbformat_minor": 1
96
+}
exploration_notebooks/extract-ram-memory.ipynb
....@@ -0,0 +1,94 @@
1
+{
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "collapsed": true
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "from pyspark.sql import SparkSession\n",
12
+ "from pyspark.sql.functions import regexp_extract, regexp_replace\n",
13
+ "import spark_jupyter"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {
20
+ "collapsed": true
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "spark = SparkSession \\\n",
25
+ " .builder \\\n",
26
+    "    .appName(\"RAM Memory Enrichment\") \\\n",
27
+ " .enableHiveSupport() \\\n",
28
+ " .getOrCreate()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {
35
+ "collapsed": true
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "df = spark.read.csv('../datasets/affilinet_products-cyberport_341_666756.txt', header=True, sep=';')"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 6,
45
+ "metadata": {
46
+ "collapsed": true
47
+ },
48
+ "outputs": [],
49
+ "source": [
50
+ "df.select(\n",
51
+ " df.Title,\n",
52
+ " df.ProductCategoryName,\n",
53
+ " regexp_extract(df.Title, '(?i)((| )(\\d+)+( +)?((g|k|m|t)B))+([^A-Za-z]|$)', 0).alias('memory'),\n",
54
+    "    regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])(\\d+)[.,]?(\\d*)[ ]?v($|[^A-Za-z\\d]))', 0).alias('voltage'),\n",
55
+ " regexp_extract(df.Title, '(?i)((ddr[\\d](-|[A-Za-z])(\\d*))|(ddr( |-|)(\\d*)))', 0).alias('ddr_type'),\n",
56
+    "    regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])((so|fb)[ -]?)?dimm([^A-Za-z\\d]|$))', 0).alias('dimm_module'),\n",
57
+ " regexp_extract(df.Title, '(?i)(pc(\\d|-)?[-]?(\\d*))', 0).alias('memory_module_name'),\n",
58
+ " regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])ram([^A-Za-z\\d]|$))', 0).alias('is_ram'),\n",
59
+ " regexp_extract(df.Title, '(?i)((^|[^A-Za-z\\d])ecc([^A-Za-z\\d]|$))', 0).alias('ecc_memory'),\n",
60
+ ").coalesce(1).write.mode('overwrite').csv('out.csv', header=True)"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {
67
+ "collapsed": true
68
+ },
69
+ "outputs": [],
70
+ "source": []
71
+ }
72
+ ],
73
+ "metadata": {
74
+ "kernelspec": {
75
+ "display_name": "Python [default]",
76
+ "language": "python",
77
+ "name": "python2"
78
+ },
79
+ "language_info": {
80
+ "codemirror_mode": {
81
+ "name": "ipython",
82
+ "version": 2
83
+ },
84
+ "file_extension": ".py",
85
+ "mimetype": "text/x-python",
86
+ "name": "python",
87
+ "nbconvert_exporter": "python",
88
+ "pygments_lexer": "ipython2",
89
+ "version": "2.7.12"
90
+ }
91
+ },
92
+ "nbformat": 4,
93
+ "nbformat_minor": 1
94
+}