-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDblpGraph.py
More file actions
executable file
·81 lines (63 loc) · 2.66 KB
/
DblpGraph.py
File metadata and controls
executable file
·81 lines (63 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) Sal Aguinaga 2015
import sys, os
import argparse, re
import pandas as pd
import numpy as np
# print mdf.head()
# #print author.head()
#
# article = pd.read_csv(article_path, index_col=0, na_filter=True,engine='c',
# dtype={"paper": str,"mdate": str,"publtype": str,"reviewid": str,
# "rating": str,"type": str,"title": str,"booktitle": str,
# "pages": str,"year": str,"address": str,"journal": str,
# "volume": str,"number": str,"month": str,"url": str,
# "ee": str,"cdrom": str,"publisher": str,"note": str,
# "isbn": str,"series": str,"school": str,"chapter": str})
#
# #article['title'] = article['title'].map(lambda x: x.lstrip('\n').rstrip('<title>'))
# ## Sampling the dataframe
# article['title'] = article['title'].map(lambda x: x.lstrip('<title>').rstrip('\n').rstrip('</title>') if isinstance(x, basestring) else None)
# print author.columns
# print article.columns, '\n','-'*80
#
# print author.head()
# print article.head()
# mdf = pd.DataFrame.join(article, author)
# print 'article, author, merged', article.shape, author.shape, mdf.shape
#
# print mdf.head()
# print mdf.tail()
#
# print
# article['title'] = article['title'].map(lambda x: x.lstrip('<title>') if isinstance(x, str) else None)
#rows = np.random.choice(article.index.values,10)
# sampled_df = article.ix[rows]
# sampled_df['title'] = sampled_df['title'].map(lambda x: x.lstrip('<title>').rstrip('\n').rstrip('</title>'))
def main():
parser = argparse.ArgumentParser(description='Generate a txt file for a graph')
parser.add_argument('file_path', help='input file path and name', action='store')
args = parser.parse_args()
if __name__ =='__main__':
#main()
author_path = "/data/bshi/dblp/author.csv"
article_path = "/data/bshi/dblp/article.csv"
field_file = "data/fieldFile.txt"
author = pd.read_csv(author_path)
with open(field_file) as f:
p_t_dict = dict()
for line in f:
lparts = re.split(r'\t\d*\ttitle\t+',line.rstrip())
if (len(lparts)>1):
p_t_dict[lparts[0]] = lparts[1]
## turn dict to pandas data frame to perform a merge
paper_title_df = pd.DataFrame.from_dict(p_t_dict.items())
paper_title_df.columns=['paper','title']
## merge dataframes
mdf = pd.DataFrame.merge(paper_title_df, author, on="paper", how="inner")
print author.shape, paper_title_df.shape, mdf.shape
print mdf.head()
## output dataframe to disk
mdf.to_csv("data/dblp_author_paper_title.csv", sep=',', mode='w', header=True, index=False)
print "\n","-"*80,"\nDone"