{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from copy import deepcopy\n", "import numpy as np\n", "import pandas as pd\n", "import calendar\n", "from pybliometrics.scopus import ScopusSearch\n", "from pybliometrics.scopus import AbstractRetrieval\n", "\n", "from tqdm import tqdm\n", "\n", "pd.options.display.max_columns = 30\n", "\n", "filename = \"./sample.txt\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "### language\n", "dic_language = {'eng': \"English\",\n", " 'kor': \"Korean\"\n", " }\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1028, 34)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eiddoipiipubmed_idtitlesubtypesubtypeDescriptioncreatorafidaffilnameaffiliation_cityaffiliation_countryauthor_countauthor_namesauthor_ids...issnsource_ideIssnaggregationTypevolumeissueIdentifierarticle_numberpageRangedescriptionauthkeywordscitedby_countopenaccessfund_acrfund_nofund_sponsor
02-s2.0-8512514579610.1016/j.egyr.2022.02.088S2352484722003353NoneData driven approach to forecast the next day ...arArticleFentis A.60017798;127756529;116607338Faculté des Sciences et Techniques de Mohammed...Mohammedia;Mohammedia;CasablancaMorocco;Morocco;Morocco5Fentis, Ayoub;Rafik, Mohamed;Bahatti, Lhoussai...57195404384;57208745688;55837734100;6506839198......None2110038951123524847Journal8NoneNone3221-3233Photovoltaic (PV) power has became an attracti...Machine learning | PV power forecasting | Rene...01NoneundefinedNone
12-s2.0-8512367614710.1016/j.egyr.2022.01.105S2352484722001056NoneEnergy modeling and photovoltaics integration ...arArticleAl Huneidi D.I.60113885Hamad Bin Khalifa University, College of Scien...DohaQatar3Al Huneidi, Dana I.;Tahir, Furqan;Al-Ghamdi, S...57389610600;57201605786;56439247500...None2110038951123524847Journal8NoneNone166-171Due to Qatar's increasing population and elect...Climate change | Demand side management | Desi...11QNRFNPRP12S-0212-190073Qatar National Research Fund
22-s2.0-8512440987410.1016/j.engappai.2022.104707S0952197622000288NoneMulti-quantile recurrent neural network for fe...arArticleZhang X.Y.60020595;60001455Royal Holloway, University of London;Anhui Uni...Egham;HefeiUnited Kingdom;China3Zhang, Xiao Yu;Watkins, Chris;Kuenzel, Stefanie57239241400;57447667900;55875409300...0952197624182NoneJournal110None104707NoneThe purpose of feeder-level energy disaggregat...Behind-the-meter PV generation | Deep neural n...00NoneundefinedNone
\n", "

3 rows × 34 columns

\n", "
" ], "text/plain": [ " eid doi pii \\\n", "0 2-s2.0-85125145796 10.1016/j.egyr.2022.02.088 S2352484722003353 \n", "1 2-s2.0-85123676147 10.1016/j.egyr.2022.01.105 S2352484722001056 \n", "2 2-s2.0-85124409874 10.1016/j.engappai.2022.104707 S0952197622000288 \n", "\n", " pubmed_id title subtype \\\n", "0 None Data driven approach to forecast the next day ... ar \n", "1 None Energy modeling and photovoltaics integration ... ar \n", "2 None Multi-quantile recurrent neural network for fe... ar \n", "\n", " subtypeDescription creator afid \\\n", "0 Article Fentis A. 60017798;127756529;116607338 \n", "1 Article Al Huneidi D.I. 60113885 \n", "2 Article Zhang X.Y. 60020595;60001455 \n", "\n", " affilname \\\n", "0 Faculté des Sciences et Techniques de Mohammed... \n", "1 Hamad Bin Khalifa University, College of Scien... \n", "2 Royal Holloway, University of London;Anhui Uni... \n", "\n", " affiliation_city affiliation_country author_count \\\n", "0 Mohammedia;Mohammedia;Casablanca Morocco;Morocco;Morocco 5 \n", "1 Doha Qatar 3 \n", "2 Egham;Hefei United Kingdom;China 3 \n", "\n", " author_names \\\n", "0 Fentis, Ayoub;Rafik, Mohamed;Bahatti, Lhoussai... \n", "1 Al Huneidi, Dana I.;Tahir, Furqan;Al-Ghamdi, S... \n", "2 Zhang, Xiao Yu;Watkins, Chris;Kuenzel, Stefanie \n", "\n", " author_ids ... issn \\\n", "0 57195404384;57208745688;55837734100;6506839198... ... None \n", "1 57389610600;57201605786;56439247500 ... None \n", "2 57239241400;57447667900;55875409300 ... 09521976 \n", "\n", " source_id eIssn aggregationType volume issueIdentifier \\\n", "0 21100389511 23524847 Journal 8 None \n", "1 21100389511 23524847 Journal 8 None \n", "2 24182 None Journal 110 None \n", "\n", " article_number pageRange \\\n", "0 None 3221-3233 \n", "1 None 166-171 \n", "2 104707 None \n", "\n", " description \\\n", "0 Photovoltaic (PV) power has became an attracti... \n", "1 Due to Qatar's increasing population and elect... \n", "2 The purpose of feeder-level energy disaggregat... \n", "\n", " authkeywords citedby_count openaccess \\\n", "0 Machine learning | PV power forecasting | Rene... 0 1 \n", "1 Climate change | Demand side management | Desi... 1 1 \n", "2 Behind-the-meter PV generation | Deep neural n... 0 0 \n", "\n", " fund_acr fund_no fund_sponsor \n", "0 None undefined None \n", "1 QNRF NPRP12S-0212-190073 Qatar National Research Fund \n", "2 None undefined None \n", "\n", "[3 rows x 34 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sample = pd.read_pickle(\"./df_sample.pkl\")\n", "print(df_sample.shape)\n", "df_sample.head(3)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "columns=[\"eid\", \"PT\", \"AU\", \"AF\", \"TI\", \n", " \"SO\", \"SO_abb\", \"LA\", \"DT\", \"DE\", \n", " \"ID\", \"AB\", \"C1\", \"RP\", \"EM\", \n", " \"CR\", \"NR\", \"TC\", \"Z9\", \"SN\", \n", " \"J9\", \"JI\", \"PD\", \"PY\", \"VL\", \n", " \"AR\", \"DI\", \"SC\"]\n", "\n", "df_ab = pd.DataFrame(columns=columns)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "looping over df_sample: 100%|██████████| 1028/1028 [00:17<00:00, 58.24it/s]\n" ] } ], "source": [ "starting_index = 0\n", "data_index = list(range(starting_index, df_sample.shape[0]))\n", "\n", "for art in tqdm(data_index, desc=\"looping over df_sample\"):\n", "\n", " eid = df_sample.loc[art, \"eid\"]\n", " ab = AbstractRetrieval(eid, view=\"FULL\")\n", "\n", " # 1. PT: publication type\n", " docu_type_ = ab.srctype\n", "\n", " # 2. AU: index name\n", " index_name_ = [a.indexed_name for a in ab.authors] if ab.authors else [None]\n", "\n", " # 3. AF: author name\n", " author_name_ = [f\"{a.surname}, {a.given_name}\" for a in ab.authors] if ab.authors else [None]\n", "\n", " # 4. TI: document title\n", " docu_title_ = df_sample.loc[art, \"title\"]\n", "\n", " # 5. SO: publication name\n", " src_title_ = ab.publicationName\n", "\n", " # 5-1. publication abbr.\n", " src_abb_ = ab.sourcetitle_abbreviation\n", "\n", " # 6. LA : Language\n", " try:\n", " language_ = dic_language[ab.language]\n", " except:\n", " language_ = \"unknown\"\n", "\n", " # 7. DT : Document Type\n", "# docu_type_ = ab.subtype\n", " docu_type_ = df_sample.loc[art, \"subtype\"]\n", "\n", " # 8. DE : Author Keywords\n", " auth_kw_ = ab.authkeywords\n", "\n", " if not auth_kw_:\n", " auth_kw_ = 'None'\n", " else:\n", " auth_kw_ = '; '.join(auth_kw_)\n", "\n", " # 9. ID : Keyword Plus\n", " kw_plus_ = 'None'\n", "\n", " # 10. AB : Abstract\n", " abstract_ = ab.abstract\n", "\n", " # 11. C1 : Author Address\n", "# if ab.authorgroup:\n", "# tmp = pd.DataFrame(ab.authorgroup)\n", "# grouped = tmp.groupby('organization').agg(lambda x: list(x))\n", "\n", "# aff_names = [str(aff) for aff in grouped[\"affiliation_id\"].index.tolist()]\n", "# aff_ids = grouped[\"affiliation_id\"].tolist()\n", "# citys = grouped[\"city\"].tolist()\n", "# countrys = grouped[\"country\"].tolist()\n", "# auids = grouped[\"auid\"].tolist()\n", "# indexed_names = grouped[\"indexed_name\"].tolist()\n", "\n", "# address = []\n", "# for aff_name, aff_id, city, country in zip(aff_names, aff_ids, citys, countrys):\n", "# if isinstance(aff_id, list):\n", "# aff_id, city, country = aff_id[0], city[0], country[0]\n", "# address.append(f\"{aff_id}, {aff_name}, {city}, {country}\")\n", "\n", "# addresss_ = list(zip(auids, indexed_names, address))\n", " if ab.affiliation and ab.authors:\n", " df_aff = pd.DataFrame(ab.affiliation)\n", " df_aff[\"id\"] = df_aff[\"id\"].astype(str)\n", " df_auth = pd.DataFrame(ab.authors)\n", " df_authgroup = df_auth.explode(\"affiliation\").groupby(\"affiliation\").agg(list).reset_index()\n", "\n", " df_aff = pd.merge(df_aff, df_authgroup, left_on=\"id\", right_on=\"affiliation\").drop([\"affiliation\"], axis=1)\n", " df_aff[\"address\"] = df_aff[['id', 'name', 'city', 'country']].apply(lambda x: ', '.join(x.astype(str)), axis=1)\n", " addresss_ = df_aff[[\"auid\", \"indexed_name\", \"address\"]].values.tolist()\n", " else:\n", " addresss_ = []\n", "\n", " # df_aff = pd.DataFrame(ab.authorgroup)\n", " # grouped = df_aff.groupby(\"affiliation_id\")\n", " # grouped[[\"organization\", \"affiliation_id\", \"city\", \"country\"]]\n", "\n", " # 12. RP : Reprint Address\n", " rep_addr_ = \"None\"\n", "\n", " # 13. EM : E-mail Address\n", " em_addr_ = \"None\"\n", "\n", " # 14. CR : Cited References\n", " refs_ = []\n", " if ab.references != None:\n", " tmp = pd.DataFrame(ab.references)\n", " refcount = int(ab.refcount)\n", "\n", " for i in range(refcount):\n", " tmp_ = tmp.iloc[i]\n", " tmp_authors = tmp_['authors']\n", " if tmp_authors == None:\n", " tmp_authors = \"[Anonymous]\"\n", " tmp_year = tmp_['publicationyear']\n", " tmp_src = tmp_['sourcetitle']\n", " tmp_vol = tmp_['volume']\n", " tmp_page = tmp_['first']\n", " tmp_doi = tmp_['doi']\n", "\n", " ref = tmp_authors\n", " for item in [tmp_year, tmp_src, tmp_vol, tmp_page]:\n", " if item != None:\n", " ref = ', '.join([ref, item])\n", " if tmp_doi != None:\n", " ref = ref + f\", DOI {tmp_doi}\"\n", "\n", " if i == 0:\n", " refs_.append(f\"CR {ref}\")\n", " else:\n", " refs_.append(f\" {ref}\")\n", "\n", " # 15. NR : Cited Reference Count\n", " nr_ = ab.refcount\n", "\n", " # 16. TC : Web of Science Core Collection Times Cited Count\n", " tc_ = ab.citedby_count\n", "\n", " # 17. Z9 : Total Times Cited Count\n", " cc_ = tc_\n", "\n", " # 18. U1 : Usage Count (Last 180 Days)\n", " # 19. U2 : Usage Count (Since 2013)\n", " # 20. PU : Publisher = ELSEVIER SCI LTD\n", " # 21. PI : Publisher City = OXFORD\n", " # 22. PA : Publisher Address = THE BOULEVARD, LANGFORD LANE, KIDLINGTON, OXFORD OX5 1GB, OXON, ENGLAND\n", " # 23. SN : International Standard Serial Number (ISSN) = 0959-6526\n", " sn_ = \"None\" if ab.issn == None else ab.issn\n", "\n", " # 24. EI : Electronic International Standard Serial Number (eISSN) = 1879-1786\n", " # 25. J9 : 29-Character Source Abbreviation = J CLEAN PROD\n", " j9_ = ab.sourcetitle_abbreviation if ab.sourcetitle_abbreviation != None else \"None\"\n", " j9_ = j9_.upper()\n", "\n", " # 26. JI : ISO Source Abbreviation = J. Clean Prod.\n", " ji_ = ab.sourcetitle_abbreviation if ab.sourcetitle_abbreviation != None else \"None\"\n", "\n", " # 27. PD : Publication Date = JUL 1\n", " month = ab.coverDate.split('-')[1]\n", " date = ab.coverDate.split('-')[2]\n", " pd_ = f\"{calendar.month_name[int(month)][:3].upper()} {int(date)}\"\n", "\n", " # 28. PY : Publication Year = 2020\n", " py_ = ab.coverDate.split('-')[0]\n", "\n", " # 29. VL : Volumn = 260\n", " vl_ = ab.volume\n", "\n", " # 30. AR : Article Number = 121059\n", " ar_ = df_sample.loc[art, \"article_number\"] if df_sample.loc[art, \"article_number\"] != None else \"None\"\n", "\n", " # 31. DI : Digital Object Identifier = 10.1016/j.jclepro.2020.121059\n", " doi_ = ab.doi\n", "\n", " # 32. PG : Page Count = 14\n", " # 33. WC : Web of Science Categories = Green & Sustainable Science & Technology; Engineering, Environmental; Environmental Sciences\n", " # 34. SC : Research Areas = Science & Technology - Other Topics; Engineering; Environmental Sciences & Ecology\n", " if ab.subject_areas: \n", " tmp = pd.DataFrame(ab.subject_areas)\n", " tmp_ = tmp[\"area\"].tolist()\n", " sc_ = \"; \".join(tmp_)\n", " else: \n", " sc_ = [None]\n", "\n", " # 35. GA : Document Delivery Number = LL4XH\n", " # 36. UT : Accession Number = WOS:000531559900003\n", " # 37. DA : Date this report was generated. = 2020-06-14\n", " \n", " # summation\n", " data=[eid, docu_type_, index_name_, author_name_, docu_title_, \n", " src_title_, src_abb_, language_, docu_type_, auth_kw_, \n", " kw_plus_, abstract_, addresss_, rep_addr_, em_addr_, \n", " refs_, nr_, tc_, cc_, sn_, \n", " j9_, ji_, pd_, py_, vl_, \n", " ar_, doi_, sc_]\n", " \n", " df_ab_tmp = pd.DataFrame(dict(zip(columns, [[d] for d in data])))\n", " df_ab = pd.concat([df_ab, df_ab_tmp], axis=0)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1028, 28)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ab.reset_index(drop=True).to_pickle(\"df_ab.pkl\")\n", "df_ab.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }