{ "cells": [
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Reading & Writing Files" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Plain Text" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Read" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['1. This is file foo.',\n", " '2. Just for demo, no practical utilities.',\n", " '3. hahahah',\n", " '4. This is line 4',\n", " '',\n", " '6. Line 5 (previous line) is empty']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Load the demo file as a list of lines with surrounding whitespace trimmed.\n",
"with open('prac/foo.txt', 'r', encoding='utf-8') as f:\n",
"    file = list(map(str.strip, f))\n",
"\n",
"file" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Write" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [
"# Write the lines back out, terminating each one with a newline.\n",
"with open('prac/write.txt', 'w', encoding='utf-8') as f:\n",
"    f.writelines(line + '\\n' for line in file)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## JSON" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Read JSON\n", "\n", "#### Read from file" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "373\n" ] }, { "data": { "text/plain": [ "{'alias': 'midnightlab',\n", " 'availableLayouts': ['classic'],\n", " 'canPost': False,\n", " 'canUseNickname': True,\n", " 'createdAt': '2016-05-14T19:15:15.698Z',\n", " 'description': '午夜實驗室10/6、10/7即將在華山登場!這裏提供大家交流活動資訊與討論,請大家要遵守 Dcard 板規喔!',\n", " 'fullyAnonymous': False,\n", " 'id': '7f125e07-4460-4ea5-80b5-33f0e9aafa0c',\n", " 'ignorePost': False,\n", " 'invisible': True,\n", " 'isSchool': False,\n", " 'limitCountries': [],\n", " 'limitStage': 0,\n", " 'mediaThreshold': {},\n", " 'name': '午夜實驗室',\n", " 'nsfw': False,\n", " 'postCount': {'last30Days': 0},\n", " 'postThumbnail': {'size': 'small'},\n", " 'read': False,\n", " 
'shouldCategorized': False,\n", " 'subcategories': [],\n", " 'subscribed': False,\n", " 'subscriptionCount': 1837,\n", " 'titlePlaceholder': '',\n", " 'topics': ['午夜實驗室'],\n", " 'updatedAt': '2018-11-05T03:24:32.914Z'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import json\n",
"\n",
"# Decode explicitly as UTF-8 (assumed to be the file's encoding, as is standard for\n",
"# JSON) so the non-ASCII text loads the same way regardless of the locale default.\n",
"with open(\"prac/dcard_forums.json\", encoding=\"utf-8\") as f:\n",
"    data = json.load(f)\n",
"\n",
"print(len(data))\n",
"data[0]" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Read from string" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# json.loads parses JSON text held in a str; json.load (above) reads from a file object.\n",
"jsonData = \"\"\"\n",
"{\"a\":1,\n",
"\"b\":2,\n",
"\"c\":3,\n",
"\"d\":4,\n",
"\"e\":5}\n",
"\"\"\"\n",
"\n",
"json.loads(jsonData)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Write JSON\n", "\n", "#### Write to file" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
"# ensure_ascii=False writes non-ASCII characters as-is instead of escaping them,\n",
"# which is why the file is opened with an explicit UTF-8 encoding.\n",
"with open(\"prac/afile.json\", \"w\", encoding=\"utf-8\") as f:\n",
"    json.dump(data, f, ensure_ascii=False)" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'{\"a\": 1, \"b\": 2}'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# json.dumps serializes to a str rather than writing to a file.\n",
"a_dict = {'a': 1, 'b': 2}\n",
"json.dumps(a_dict)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## CSV" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Read: `DictReader`" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[OrderedDict([('證券代碼', '4938 '),\n", " ('簡稱', '和碩 '),\n", " ('年月日', '20110103'),\n", " ('開盤價(元)', ' 29.32'),\n", " ('最高價(元)', ' 29.42'),\n", " ('最低價(元)', ' 29.21'),\n", " ('收盤價(元)', ' 29.28'),\n", " ('成交值(千元)', ' 
89508')]),\n", " OrderedDict([('證券代碼', '4938 '),\n", " ('簡稱', '和碩 '),\n", " ('年月日', '20110104'),\n", " ('開盤價(元)', ' 29.42'),\n", " ('最高價(元)', ' 29.56'),\n", " ('最低價(元)', ' 29.28'),\n", " ('收盤價(元)', ' 29.42'),\n", " ('成交值(千元)', ' 107340')]),\n", " OrderedDict([('證券代碼', '4938 '),\n", " ('簡稱', '和碩 '),\n", " ('年月日', '20110105'),\n", " ('開盤價(元)', ' 29.42'),\n", " ('最高價(元)', ' 29.42'),\n", " ('最低價(元)', ' 28.16'),\n", " ('收盤價(元)', ' 28.72'),\n", " ('成交值(千元)', ' 193077')])]\n" ] } ], "source": [
"import csv\n",
"import pprint as pp\n",
"from itertools import islice\n",
"\n",
"fpath = 'prac/stock.tsv'\n",
"\n",
"# The context manager closes the handle even if decoding or parsing raises;\n",
"# newline='' is what the csv module asks for on file objects passed to it.\n",
"with open(fpath, \"r\", encoding='cp950', newline='') as f:\n",
"    csvFile = csv.DictReader(f, delimiter='\\t')\n",
"    # Parse only the three rows that are displayed instead of the whole file.\n",
"    first_rows = list(islice(csvFile, 3))\n",
"\n",
"pp.pprint(first_rows)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4938 89508\n", "4938 107340\n", "4938 193077\n", "4938 259533\n", "4938 330576\n" ] } ], "source": [
"with open(fpath, \"r\", encoding='cp950', newline='') as f:\n",
"    csvFile = csv.DictReader(f, delimiter='\\t')\n",
"    # Show the first five rows; int() ignores the padding around the raw fields.\n",
"    for row in islice(csvFile, 5):\n",
"        print(int(row['證券代碼']), int(row['成交值(千元)']))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Read: `reader`" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FIELDNAMES: ['證券代碼', '簡稱', '年月日', '開盤價(元)', '最高價(元)', '最低價(元)', '收盤價(元)', '成交值(千元)']\n", "['4938', '和碩', '20110103', '29.32', '29.42', '29.21', '29.28', '89508']\n", "['4938', '和碩', '20110104', '29.42', '29.56', '29.28', '29.42', '107340']\n", "['4938', '和碩', '20110105', '29.42', '29.42', '28.16', '28.72', '193077']\n", "['4938', '和碩', '20110106', '28.93', '29.00', '28.02', '28.20', '259533']\n", "['4938', '和碩', '20110107', '28.02', '28.16', '27.22', '27.32', '330576']\n" ] } ], "source": [
"with open(fpath, 'r', encoding='cp950', newline='') as f:\n",
"    csvFile = csv.reader(f, delimiter='\\t')\n",
"    # csv.reader yields plain lists, so the header row is consumed by hand.\n",
"    header = next(csvFile)\n",
"    print('FIELDNAMES:', header)\n",
"\n",
"    # Print the first five data rows with the field padding stripped.\n",
"    for row in islice(csvFile, 5):\n",
"        print([ele.strip() for ele in row])" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Read: Pandas" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>證券代碼</th>\n", " <th>簡稱</th>\n", " <th>年月日</th>\n", " <th>開盤價(元)</th>\n", " <th>最高價(元)</th>\n", " <th>最低價(元)</th>\n", " <th>收盤價(元)</th>\n", " <th>成交值(千元)</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20110103</td>\n", " <td>29.32</td>\n", " <td>29.42</td>\n", " <td>29.21</td>\n", " <td>29.28</td>\n", " <td>89508</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20110104</td>\n", " <td>29.42</td>\n", " <td>29.56</td>\n", " <td>29.28</td>\n", " <td>29.42</td>\n", " <td>107340</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20110105</td>\n", " <td>29.42</td>\n", " <td>29.42</td>\n", " <td>28.16</td>\n", " <td>28.72</td>\n", " <td>193077</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20110106</td>\n", " <td>28.93</td>\n", " <td>29.00</td>\n", " <td>28.02</td>\n", " <td>28.20</td>\n", " <td>259533</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " 
<td>20110107</td>\n", " <td>28.02</td>\n", " <td>28.16</td>\n", " <td>27.22</td>\n", " <td>27.32</td>\n", " <td>330576</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>1229</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20151224</td>\n", " <td>66.06</td>\n", " <td>66.40</td>\n", " <td>64.97</td>\n", " <td>65.06</td>\n", " <td>318025</td>\n", " </tr>\n", " <tr>\n", " <th>1230</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20151225</td>\n", " <td>64.89</td>\n", " <td>65.64</td>\n", " <td>64.89</td>\n", " <td>65.64</td>\n", " <td>108529</td>\n", " </tr>\n", " <tr>\n", " <th>1231</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20151228</td>\n", " <td>66.06</td>\n", " <td>66.06</td>\n", " <td>65.31</td>\n", " <td>65.73</td>\n", " <td>141813</td>\n", " </tr>\n", " <tr>\n", " <th>1232</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20151229</td>\n", " <td>65.23</td>\n", " <td>65.23</td>\n", " <td>62.54</td>\n", " <td>63.05</td>\n", " <td>731616</td>\n", " </tr>\n", " <tr>\n", " <th>1233</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20151230</td>\n", " <td>63.30</td>\n", " <td>63.55</td>\n", " <td>60.53</td>\n", " <td>61.37</td>\n", " <td>916988</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>1234 rows × 8 columns</p>\n", "</div>" ], "text/plain": [ " 證券代碼 簡稱 年月日 開盤價(元) 最高價(元) 最低價(元) 收盤價(元) 成交值(千元)\n", "0 4938 和碩 20110103 29.32 29.42 29.21 29.28 89508\n", "1 4938 和碩 20110104 29.42 29.56 29.28 29.42 107340\n", "2 4938 和碩 20110105 29.42 29.42 28.16 28.72 193077\n", "3 4938 和碩 20110106 28.93 29.00 28.02 28.20 259533\n", "4 4938 和碩 20110107 28.02 28.16 27.22 27.32 330576\n", "... ... ... ... ... ... ... ... 
...\n", "1229 4938 和碩 20151224 66.06 66.40 64.97 65.06 318025\n", "1230 4938 和碩 20151225 64.89 65.64 64.89 65.64 108529\n", "1231 4938 和碩 20151228 66.06 66.06 65.31 65.73 141813\n", "1232 4938 和碩 20151229 65.23 65.23 62.54 63.05 731616\n", "1233 4938 和碩 20151230 63.30 63.55 60.53 61.37 916988\n", "\n", "[1234 rows x 8 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import pandas as pd\n",
"\n",
"# The input is tab-separated and cp950-encoded, so both are stated explicitly.\n",
"df = pd.read_csv(\"prac/stock.tsv\", sep=\"\\t\", encoding=\"cp950\")\n",
"df" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Write: `pd.DataFrame.to_csv()`" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
"# The output is comma-separated UTF-8 (spelled out here; it is also the pandas default),\n",
"# unlike the cp950 TSV that was read. index=False leaves out the row-number column.\n",
"df.to_csv(\"prac/stock.csv\", index=False, encoding=\"utf-8\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Write: `pd.DataFrame.to_json()`" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>證券代碼</th>\n", " <th>簡稱</th>\n", " <th>年月日</th>\n", " <th>開盤價(元)</th>\n", " <th>最高價(元)</th>\n", " <th>最低價(元)</th>\n", " <th>收盤價(元)</th>\n", " <th>成交值(千元)</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20110103</td>\n", " <td>29.32</td>\n", " <td>29.42</td>\n", " <td>29.21</td>\n", " <td>29.28</td>\n", " <td>89508</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>4938</td>\n", " <td>和碩</td>\n", " <td>20110104</td>\n", " <td>29.42</td>\n", " <td>29.56</td>\n", " <td>29.28</td>\n", " <td>29.42</td>\n", " <td>107340</td>\n", " </tr>\n", " </tbody>\n", 
"</table>\n", "</div>" ], "text/plain": [ " 證券代碼 簡稱 年月日 開盤價(元) 最高價(元) 最低價(元) 收盤價(元) 成交值(千元)\n", "0 4938 和碩 20110103 29.32 29.42 29.21 29.28 89508\n", "1 4938 和碩 20110104 29.42 29.56 29.28 29.42 107340" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Preview the two rows that the following cells export.\n",
"df.head(2)" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [
"# Export the two rows as a list of row objects, leaving non-ASCII text unescaped.\n",
"df.head(2).to_json(\n",
"    \"prac/stock.json\",\n",
"    orient=\"records\",\n",
"    force_ascii=False,\n",
")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Details of `to_json()`\n", "\n", "See [doc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html#pandas.DataFrame.to_json) for details." ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[{\"證券代碼\":4938,\"簡稱\":\"和碩 \",\"年月日\":20110103,\"開盤價(元)\":29.32,\"最高價(元)\":29.42,\"最低價(元)\":29.21,\"收盤價(元)\":29.28,\"成交值(千元)\":89508},{\"證券代碼\":4938,\"簡稱\":\"和碩 \",\"年月日\":20110104,\"開盤價(元)\":29.42,\"最高價(元)\":29.56,\"最低價(元)\":29.28,\"收盤價(元)\":29.42,\"成交值(千元)\":107340}]'" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Without a path argument, to_json() returns the JSON text instead of writing a file.\n",
"json_str = df.head(2).to_json(orient=\"records\", force_ascii=False)\n",
"json_str" ] },
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'年月日': 20110103,\n", " '成交值(千元)': 89508,\n", " '收盤價(元)': 29.28,\n", " '最低價(元)': 29.21,\n", " '最高價(元)': 29.42,\n", " '簡稱': '和碩 ',\n", " '證券代碼': 4938,\n", " '開盤價(元)': 29.32},\n", " {'年月日': 20110104,\n", " '成交值(千元)': 107340,\n", " '收盤價(元)': 29.42,\n", " '最低價(元)': 29.28,\n", " '最高價(元)': 29.56,\n", " '簡稱': '和碩 ',\n", " '證券代碼': 4938,\n", " '開盤價(元)': 29.42}]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Round trip: parse the exported text back into Python lists and dicts.\n",
"json.loads(json_str)" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": 
{ "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }